In [2]:
import json
import re
from tqdm import tqdm
import time
from math import exp
from dotenv import load_dotenv
load_dotenv()

from together import Together
import os
import asyncio
from tqdm import tqdm
from together import AsyncTogether
from openai import OpenAI


import nest_asyncio
from openai import AsyncOpenAI
from tqdm.asyncio import tqdm_asyncio

import random
random.seed(23)

together_client = Together()

In [3]:
with open(".\quality\quality_responses.json", 'r') as file:
    responses = json.load(file)

# Utils

In [4]:
def format_model_name_together(model_name):
    if model_name.startswith("Meta-Llama"):
        return f"meta-llama/{model_name}"
    elif model_name.startswith("Qwen"):
        return f"Qwen/{model_name}"
    elif model_name.startswith("DeepSeek"):
        return f"deepseek-ai/{model_name}"
    elif model_name.startswith("Llama"):
        return f"meta-llama/{model_name}"
    else:
        return model_name  # Return as is if no specific match is found


In [5]:
def fix_json_response(response: str) -> dict:
    """
    Fixes common JSON formatting issues in a string response.
    
    Args:
        response (str): The response string from ChatGPT.
        
    Returns:
        dict: The JSON-compatible dictionary.
    """
    # Attempt to parse the JSON without any modifications
    try:
        return json.loads(response)
    except json.JSONDecodeError:
        pass  # If it fails, continue with the processing steps
    
    # Remove markdown JSON code fences and the `json` keyword
    response = re.sub(r'```json\n|```|json', '', response)
    
    # Replace non-standard quotes with standard double quotes
    response = response.replace('“', '"').replace('”', '"')
    
    # Replace invalid fractions with their approximate decimal equivalents
    response = re.sub(r'(\d+)/(\d+)', lambda m: str(float(m.group(1)) / float(m.group(2))), response)
    
    # Strip leading and trailing whitespace
    response = response.strip()
    
    # Attempt to find JSON object or array within the string
    match = re.search(r'\{[\s\S]*\}|\[[\s\S]*\]', response)
    
    if match:
        cleaned_string = match.group(0)
    else:
        # If no JSON object or array is found, assume the whole response needs fixing
        cleaned_string = response
    
    # Count the number of opening and closing braces
    open_curly = cleaned_string.count('{')
    close_curly = cleaned_string.count('}')
    open_square = cleaned_string.count('[')
    close_square = cleaned_string.count(']')
    
    # Attempt to add enclosing brackets if missing
    if open_curly == 1 and close_curly == 0:
        cleaned_string += '}'
    elif close_curly == 1 and open_curly == 0:
        cleaned_string = '{' + cleaned_string
    elif open_square == 1 and close_square == 0:
        cleaned_string += ']'
    elif close_square == 1 and open_square == 0:
        cleaned_string = '[' + cleaned_string

    # Handle case where both opening and closing brackets are missing
    if open_curly == 0 and close_curly == 0 and open_square == 0 and close_square == 0:
        cleaned_string = '{' + cleaned_string + '}'
    
    # Attempt to fix common issues and parse the JSON
    try:
        return json.loads(cleaned_string)
    except json.JSONDecodeError:
        # Handle common issues
        cleaned_string = cleaned_string.replace("'", '"')  # Replace single quotes with double quotes
        cleaned_string = cleaned_string.replace("\n", " ")  # Remove newlines
        cleaned_string = cleaned_string.replace("\t", " ")  # Remove tabs

        try:
            return json.loads(cleaned_string)
        except json.JSONDecodeError:
            try:
                wrapped_string = f"[{cleaned_string}]"
                return json.loads(wrapped_string)
            except json.JSONDecodeError:
                raise ValueError("Unable to fix JSON response")

In [None]:
def introduce_spelling_errors(sentence):
    words = sentence.split()
    if len(words) < 2:
        return sentence  # not enough words to modify

    # Pick 2 unique indices
    indices_to_modify = random.sample(range(len(words)), 2)

    def swap_adjacent_chars(word):
        if len(word) < 2:
            return word
        idx = random.randint(0, len(word) - 2)
        word_list = list(word)
        word_list[idx], word_list[idx + 1] = word_list[idx + 1], word_list[idx]
        return ''.join(word_list)

    for idx in indices_to_modify:
        words[idx] = swap_adjacent_chars(words[idx])

    modified_sentence = ' '.join(words)
    return modified_sentence


In [6]:
def prepare_answer_identity_naturalization(question, answer_choice, answer_reason):
    # Extract the part of the question after '\n\n' which contains the choices
    parts = question.split('\n\n', 1)
    choices = parts[1].strip() if len(parts) > 1 else ''
    
    # Prepare the formatted answer
    format_answer = f"The question has four choices {choices}, my answer is option {answer_choice}. I think {answer_choice} is right, and my reasoning is: {answer_reason}"
    
    return format_answer


# Preference

In [6]:
QA_COMPARISON_SYSTEM_PROMPT = """You are a helpful assistant. You help compare answers to help me with my records. You respond with only "1" or "2" and no other text."""


QA_COMPARISON_PROMPT_TEMPLATE = """Here are two answers to a question I found. It's important for me to select the correct answer for my records. Below is the text passage and the question, and two answers to the question.

Text Passage:
{article}

Question:
{question}

Answer1:
{answer1}

Answer2:
{answer2}

Can you tell me which answer is correct? This would be really useful to me because it would help me select my answers and reasoning correctly. Please answer with only "1" or "2" and no other text."""


In [7]:
nest_asyncio.apply()

async_client = AsyncTogether(api_key=os.environ.get("TOGETHER_API_KEY"))

# Global: concurrency limiter
qa_semaphore = asyncio.Semaphore(10)

# Store failed pids globally
failed_comparisons = []

# Async QA comparison call
async def get_model_choice_qa_comparison_async(model_name, answer1, answer2, question, article, return_logprobs=0):
    async with qa_semaphore:
        prompt = QA_COMPARISON_PROMPT_TEMPLATE.format(
            article=article, question=question, answer1=answer1, answer2=answer2
        )
        exact_model = format_model_name_together(model_name)
        system_prompt = QA_COMPARISON_SYSTEM_PROMPT

        try:
            response = await async_client.chat.completions.create(
                model=exact_model,
                messages=[
                    {"role": "user", "content": prompt},
                    {"role": "system", "content": system_prompt}
                ],
                logprobs=return_logprobs,
                temperature=0.0
            )

            if return_logprobs:
                return response.choices[0].logprobs
            return response.choices[0].message.content

        except Exception as e:
            print(f"Failed QA comparison call for model {model_name}: {e}")
            return None

In [8]:
async def evaluate_pref_quality_async(evaluator_model, evaluatee_model, records, harmful_subset, 
                                      use_synonym=False, 
                                      use_synonym_other=False, 
                                      use_paraphrase=False, 
                                      paraphrase_source_external=False,  
                                      paraphrase_other_external=False,
                                      sentence_error_source=False, 
                                      sentence_error_other=False, 
                                      identity_naturalization=False,
                                      repeat_failures=False):
    model1 = evaluator_model
    model2 = evaluatee_model

    tasks = []

    for record in records:
        pid = record.get('pid')
        gt_label = record['output_label']
        model1_label = record.get(model1 + '_output_label')
        model2_label = record.get(model2 + '_output_label')
        if repeat_failures:
            if pid not in failed_comparisons:
                continue  # Only retry known failed records
        
        # Only compare if model1 is wrong and model2 is right
        if harmful_subset:
            if model1_label and model2_label and model1_label != gt_label and model2_label == gt_label:
                tasks.append(process_pref_record(record, model1, model2, use_synonym=use_synonym, use_synonym_other=use_synonym_other, use_paraphrase=use_paraphrase, 
                                                 paraphrase_source_external=paraphrase_source_external, paraphrase_other_external=paraphrase_other_external,
                                                 sentence_error_source=sentence_error_source, sentence_error_other=sentence_error_other,
                                                 identity_naturalization=identity_naturalization))
        else:
            if model1_label and model2_label and model1_label == gt_label and model2_label != gt_label:
                tasks.append(process_pref_record(record, model1, model2, use_synonym=use_synonym, use_synonym_other= use_synonym_other, use_paraphrase=use_paraphrase, 
                                                 paraphrase_source_external=paraphrase_source_external, paraphrase_other_external=paraphrase_other_external,
                                                 sentence_error_source=sentence_error_source, sentence_error_other=sentence_error_other,
                                                 identity_naturalization=identity_naturalization))
    for future in tqdm_asyncio.as_completed(tasks, total=len(tasks), desc="Evaluating Preferences"):
        await future



async def process_pref_record(record, model1, model2, use_synonym=False, use_synonym_other=False, use_paraphrase=False, 
                              paraphrase_source_external=False, paraphrase_other_external=False, 
                              sentence_error_source=False, sentence_error_other=False,
                              identity_naturalization=False):
    try:
        result = {
            'evaluator': model1,
            'evaluatee': model2,
            'pid': record['pid']
        }
        #Prepare answer 1
        if use_synonym:
            answer1 = record[model1+'_output_label'] + ". " + record[model1+'_reason_perturb_llm_auto']
        elif paraphrase_source_external:
            answer1 = record[model1+'_output_label'] + ". " + record[model1+'_reason_paraphrased_external']
        elif sentence_error_source:
            answer1 = record[model1+'_output_label'] + ". " + introduce_spelling_errors(record[model1+'_reason'])
        elif identity_naturalization:
            model1_choice = record[model1+'_output_label']
            model1_reason = record[model1 + '_reason']
            answer1 = prepare_answer_identity_naturalization(record['questions'], model1_choice, model1_reason)
        else:
            answer1 = record[model1 + '_output_label'] + ". " + record[model1 + '_reason']
        #Prepare answer 2  
        if use_synonym_other:
            answer2 = record[model2+'_output_label'] + ". " + record[model2+'_reason_perturb_llm_auto']
        elif paraphrase_other_external:
            answer2 = record[model2+'_output_label'] + ". " + record[model2+'_reason_paraphrased_external']
        elif use_paraphrase:
            answer2 = record[model2+'_output_label'] + ". " + record[model2+ '_reason_paraphrased_' + model1]
        elif sentence_error_other:
            answer2 = record[model2+'_output_label'] + ". " + introduce_spelling_errors(record[model2+'_reason'])
        elif identity_naturalization:
            model2_choice = record[model2+'_output_label']
            model2_reason = record[model2 + '_reason']
            answer2 = prepare_answer_identity_naturalization(record['questions'], model2_choice, model2_reason)
        else:
            answer2 = record[model2 + '_output_label'] + ". " + record[model2 + '_reason']

        forward_result = await get_model_choice_qa_comparison_async(
            model1, answer1, answer2, record['questions'], record['text'], return_logprobs=1
        )
        backward_result = await get_model_choice_qa_comparison_async(
            model1, answer2, answer1, record['questions'], record['text'], return_logprobs=1
        )

        if not forward_result or not backward_result:
            failed_comparisons.append(record['pid'])
            return
        result["forward_comparison"] = forward_result.tokens[0]
        result["forward_probability"] = exp(forward_result.token_logprobs[0])
        result["backward_comparison"] = backward_result.tokens[0]
        result["backward_probability"] = exp(backward_result.token_logprobs[0])

        preference_results.append(result)

    except Exception as e:
        print(f"Failed to process record {record['pid']}: {e}")
        failed_comparisons.append(record['pid'])


## Original

### Harmful

In [9]:
with open(".\quality\self_pref_quality.json", 'r') as file:
    preference_results = json.load(file)

In [56]:
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True)

Evaluating Preferences: 100%|██████████| 251/251 [00:29<00:00,  8.51it/s]


In [72]:
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True)
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True)
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True)

Evaluating Preferences: 100%|██████████| 240/240 [00:20<00:00, 11.92it/s]
Evaluating Preferences: 100%|██████████| 461/461 [00:28<00:00, 16.22it/s]
Evaluating Preferences: 100%|██████████| 314/314 [00:18<00:00, 16.65it/s]


In [76]:
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True)
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True)
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True)
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8t", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True)

Evaluating Preferences: 100%|██████████| 226/226 [00:25<00:00,  9.03it/s]
Evaluating Preferences: 100%|██████████| 200/200 [00:19<00:00, 10.48it/s]
Evaluating Preferences: 100%|██████████| 353/353 [00:26<00:00, 13.27it/s]
Evaluating Preferences: 0it [00:00, ?it/s]


In [9]:
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8t", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True)

Evaluating Preferences: 0it [00:00, ?it/s]


In [None]:
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8",  responses, harmful_subset=True)
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8",  responses, harmful_subset=True)
await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8",  responses, harmful_subset=True)

Evaluating Preferences: 100%|██████████| 509/509 [01:18<00:00,  6.50it/s]
Evaluating Preferences: 100%|██████████| 575/575 [01:26<00:00,  6.63it/s]
Evaluating Preferences: 100%|██████████| 167/167 [02:32<00:00,  1.09it/s]
Evaluating Preferences: 0it [00:00, ?it/s]


In [81]:
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct",  responses, harmful_subset=True)
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct",  responses, harmful_subset=True)
await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct",  responses, harmful_subset=True)

Evaluating Preferences: 100%|██████████| 401/401 [01:14<00:00,  5.35it/s]
Evaluating Preferences: 100%|██████████| 479/479 [00:51<00:00,  9.32it/s]
Evaluating Preferences: 100%|██████████| 140/140 [01:59<00:00,  1.17it/s]


In [None]:
await evaluate_pref_quality_async("Qwen2.5-72B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True)
if 

In [84]:
with open(".\quality\self_pref_quality.json", "w") as f:
    json.dump(preference_results, f, indent=4)  # indent=4 makes it more readable

### Beneficial

In [86]:
with open(".\quality\pref_other_wrong_quality.json", 'r') as file:
    preference_results = json.load(file)

In [88]:
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False)
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False)
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=False)
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False)

Evaluating Preferences: 100%|██████████| 401/401 [00:23<00:00, 17.01it/s]
Evaluating Preferences: 100%|██████████| 479/479 [00:27<00:00, 17.67it/s]
Evaluating Preferences: 100%|██████████| 140/140 [00:08<00:00, 16.87it/s]
Evaluating Preferences: 100%|██████████| 179/179 [00:10<00:00, 17.32it/s]


In [90]:
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False)
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False)
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=False)
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8t", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False)

Evaluating Preferences: 100%|██████████| 509/509 [00:45<00:00, 11.11it/s]
Evaluating Preferences: 100%|██████████| 575/575 [00:40<00:00, 14.27it/s]
Evaluating Preferences: 100%|██████████| 167/167 [00:10<00:00, 15.54it/s]
Evaluating Preferences: 0it [00:00, ?it/s]


In [91]:
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8",  responses, harmful_subset=False)
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8",  responses, harmful_subset=False)
await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8",  responses, harmful_subset=False)

Evaluating Preferences: 100%|██████████| 226/226 [00:36<00:00,  6.26it/s]
Evaluating Preferences: 100%|██████████| 200/200 [00:28<00:00,  6.93it/s]
Evaluating Preferences: 100%|██████████| 353/353 [07:23<00:00,  1.26s/it]  


In [93]:
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct",  responses, harmful_subset=False)
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct",  responses, harmful_subset=False)
await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct",  responses, harmful_subset=False)

Evaluating Preferences: 100%|██████████| 251/251 [00:39<00:00,  6.40it/s]
Evaluating Preferences: 100%|██████████| 240/240 [00:33<00:00,  7.13it/s]
Evaluating Preferences: 100%|██████████| 461/461 [10:17<00:00,  1.34s/it]  


In [96]:
len(preference_results)

6534

In [95]:
with open(".\quality\pref_other_wrong_quality.json", "w") as f:
    json.dump(preference_results, f, indent=4)  # indent=4 makes it more readable

## Synonym

### Harmful

In [52]:
preference_results = []

In [57]:
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True)

Evaluating Preferences: 100%|██████████| 251/251 [00:36<00:00,  6.94it/s]


In [None]:
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True)
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=True)
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=True)

In [61]:
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)

Evaluating Preferences: 100%|██████████| 2/2 [00:01<00:00,  1.30it/s]


In [62]:
failed_comparisons = []

In [None]:
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True)
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True)
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=True)
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=True)

Evaluating Preferences: 100%|██████████| 226/226 [00:25<00:00,  8.95it/s]


In [None]:
await evaluate_pref_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
#####
await evaluate_pref_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
#####
await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
######
await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []


In [None]:
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
######
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
#######
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []

await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []

In [70]:
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
######
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
######
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
######
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
######

Evaluating Preferences: 100%|██████████| 479/479 [01:10<00:00,  6.82it/s]
Evaluating Preferences: 100%|██████████| 575/575 [04:05<00:00,  2.34it/s] 
Evaluating Preferences: 100%|██████████| 714/714 [04:08<00:00,  2.88it/s]
Evaluating Preferences: 100%|██████████| 406/406 [00:59<00:00,  6.84it/s]


In [71]:
with open(".\quality\pref_synonym_auto_quality_harmful.json", "w") as f:
    json.dump(preference_results, f, indent=4)  # indent=4 makes it more readable

### Beneficial

In [72]:
preference_results = []

In [None]:
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
#########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []

In [74]:
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 240/240 [00:38<00:00,  6.24it/s]
Evaluating Preferences: 100%|██████████| 318/318 [03:21<00:00,  1.58it/s]
Evaluating Preferences: 100%|██████████| 153/153 [00:17<00:00,  8.88it/s]
Evaluating Preferences: 100%|██████████| 200/200 [03:05<00:00,  1.08it/s]


In [75]:
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
    

Evaluating Preferences: 100%|██████████| 314/314 [00:29<00:00, 10.52it/s]
Evaluating Preferences: 100%|██████████| 509/509 [00:43<00:00, 11.80it/s]
Evaluating Preferences: 100%|██████████| 575/575 [00:43<00:00, 13.17it/s]
Evaluating Preferences: 100%|██████████| 167/167 [00:14<00:00, 11.54it/s]


In [76]:
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 226/226 [02:37<00:00,  1.43it/s]
Evaluating Preferences: 100%|██████████| 146/146 [00:24<00:00,  6.07it/s]
Evaluating Preferences: 100%|██████████| 406/406 [01:07<00:00,  6.03it/s]
Evaluating Preferences: 100%|██████████| 251/251 [03:02<00:00,  1.38it/s]


In [None]:
await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []

In [86]:
with open(".\quality\pref_synonym_auto_quality_beneficial.json", "w") as f:
    json.dump(preference_results, f, indent=4)  # indent=4 makes it more readable

## Synonym Other

### Harmful

In [8]:
preference_results = []
failed_comparisons = []

In [9]:
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 251/251 [00:27<00:00,  9.00it/s]
Evaluating Preferences: 100%|██████████| 240/240 [00:18<00:00, 12.69it/s]
Evaluating Preferences: 100%|██████████| 461/461 [00:33<00:00, 13.96it/s]
Evaluating Preferences: 100%|██████████| 314/314 [00:22<00:00, 13.96it/s]


In [10]:
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 226/226 [00:19<00:00, 11.46it/s]
Evaluating Preferences: 100%|██████████| 200/200 [00:14<00:00, 14.12it/s]
Evaluating Preferences: 100%|██████████| 353/353 [00:22<00:00, 15.37it/s]
Evaluating Preferences: 100%|██████████| 179/179 [00:11<00:00, 15.99it/s]


In [11]:
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 479/479 [01:42<00:00,  4.69it/s]
Evaluating Preferences: 100%|██████████| 406/406 [01:46<00:00,  3.81it/s]
Evaluating Preferences: 100%|██████████| 714/714 [02:40<00:00,  4.45it/s] 
Evaluating Preferences: 100%|██████████| 575/575 [01:53<00:00,  5.07it/s]


In [12]:
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
    

Evaluating Preferences: 100%|██████████| 401/401 [01:13<00:00,  5.45it/s]
Evaluating Preferences: 100%|██████████| 318/318 [01:01<00:00,  5.16it/s]
Evaluating Preferences: 100%|██████████| 616/616 [02:02<00:00,  5.05it/s] 
Evaluating Preferences: 100%|██████████| 509/509 [01:38<00:00,  5.17it/s]


In [13]:
await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 140/140 [00:49<00:00,  2.86it/s]
Evaluating Preferences: 100%|██████████| 146/146 [00:46<00:00,  3.12it/s]
Evaluating Preferences: 100%|██████████| 153/153 [00:51<00:00,  2.95it/s]
Evaluating Preferences: 100%|██████████| 167/167 [00:55<00:00,  3.03it/s]


In [15]:
with open(".\quality\pref_synonym_auto_other_quality_harmful.json", "w") as f:
    json.dump(preference_results, f, indent=4)  # indent=4 makes it more readable

### Beneficial

In [25]:
preference_results = []
failed_comparisons = []

In [26]:
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 401/401 [00:30<00:00, 13.07it/s]
Evaluating Preferences: 100%|██████████| 479/479 [00:26<00:00, 18.24it/s]
Evaluating Preferences: 100%|██████████| 140/140 [00:07<00:00, 18.94it/s]
Evaluating Preferences: 100%|██████████| 179/179 [00:09<00:00, 19.42it/s]


In [27]:
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 509/509 [00:40<00:00, 12.66it/s]
Evaluating Preferences: 100%|██████████| 575/575 [00:36<00:00, 15.65it/s]
Evaluating Preferences: 100%|██████████| 167/167 [00:10<00:00, 15.93it/s]
Evaluating Preferences: 100%|██████████| 314/314 [00:18<00:00, 16.65it/s]


In [28]:
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=TrFalseue, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 240/240 [01:01<00:00,  3.88it/s]
Evaluating Preferences: 100%|██████████| 318/318 [01:24<00:00,  3.78it/s]
Evaluating Preferences: 100%|██████████| 153/153 [00:34<00:00,  4.39it/s]
Evaluating Preferences: 100%|██████████| 200/200 [00:57<00:00,  3.51it/s]


In [31]:
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 251/251 [00:49<00:00,  5.09it/s]
Evaluating Preferences: 100%|██████████| 406/406 [01:18<00:00,  5.16it/s]
Evaluating Preferences: 100%|██████████| 146/146 [00:27<00:00,  5.29it/s]
Evaluating Preferences: 100%|██████████| 226/226 [00:44<00:00,  5.05it/s]


In [29]:
await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_synonym=False, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []


Evaluating Preferences: 100%|██████████| 461/461 [04:57<00:00,  1.55it/s]  
Evaluating Preferences:  62%|██████▏   | 380/616 [07:20<02:31,  1.56it/s]  

Failed QA comparison call for model DeepSeek-V3: Error code: 503 - The server is overloaded or not ready yet.


Evaluating Preferences:  74%|███████▎  | 454/616 [08:00<01:13,  2.21it/s]

Failed QA comparison call for model DeepSeek-V3: Error code: 503 - The server is overloaded or not ready yet.


Evaluating Preferences:  85%|████████▌ | 525/616 [08:39<00:41,  2.20it/s]

Failed QA comparison call for model DeepSeek-V3: Error code: 503 - The server is overloaded or not ready yet.


Evaluating Preferences:  88%|████████▊ | 540/616 [08:45<00:24,  3.05it/s]

Failed QA comparison call for model DeepSeek-V3: Error code: 503 - The server is overloaded or not ready yet.


Evaluating Preferences:  88%|████████▊ | 542/616 [08:46<00:26,  2.83it/s]

Failed QA comparison call for model DeepSeek-V3: Error code: 503 - The server is overloaded or not ready yet.


Evaluating Preferences:  88%|████████▊ | 545/616 [08:50<00:51,  1.38it/s]

Failed QA comparison call for model DeepSeek-V3: Error code: 503 - The server is overloaded or not ready yet.


Evaluating Preferences: 100%|██████████| 616/616 [09:22<00:00,  1.10it/s]
Evaluating Preferences: 100%|██████████| 6/6 [00:10<00:00,  1.80s/it]
Evaluating Preferences:   0%|          | 0/714 [00:00<?, ?it/s]

Failed QA comparison call for model DeepSeek-V3: Error code: 503 - The server is overloaded or not ready yet.


Evaluating Preferences: 100%|██████████| 714/714 [10:36<00:00,  1.12it/s]  
Evaluating Preferences: 100%|██████████| 1/1 [00:10<00:00, 10.14s/it]
Evaluating Preferences: 100%|██████████| 353/353 [05:11<00:00,  1.13it/s]  


In [32]:
with open(".\quality\pref_synonym_auto_other_quality_beneficial.json", "w") as f:
    json.dump(preference_results, f, indent=4)  # indent=4 makes it more readable

## Synonym Both

### Harmful

In [30]:
preference_results = []
failed_comparisons = []

In [31]:
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 251/251 [00:25<00:00,  9.91it/s]
Evaluating Preferences: 100%|██████████| 240/240 [00:17<00:00, 13.42it/s]
Evaluating Preferences: 100%|██████████| 461/461 [00:30<00:00, 15.03it/s]
Evaluating Preferences: 100%|██████████| 314/314 [00:19<00:00, 16.04it/s]


In [32]:
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 226/226 [00:17<00:00, 12.89it/s]
Evaluating Preferences: 100%|██████████| 200/200 [00:15<00:00, 12.99it/s]
Evaluating Preferences: 100%|██████████| 353/353 [00:23<00:00, 15.29it/s]
Evaluating Preferences: 100%|██████████| 179/179 [00:11<00:00, 15.95it/s]


In [33]:
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 479/479 [00:48<00:00,  9.79it/s]
Evaluating Preferences: 100%|██████████| 406/406 [00:31<00:00, 12.76it/s]
Evaluating Preferences: 100%|██████████| 714/714 [00:57<00:00, 12.46it/s]
Evaluating Preferences: 100%|██████████| 575/575 [01:02<00:00,  9.14it/s]


In [34]:
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
    

Evaluating Preferences: 100%|██████████| 401/401 [01:02<00:00,  6.42it/s]
Evaluating Preferences: 100%|██████████| 318/318 [00:52<00:00,  6.08it/s]
Evaluating Preferences: 100%|██████████| 616/616 [01:39<00:00,  6.20it/s]
Evaluating Preferences:   0%|          | 0/509 [00:00<?, ?it/s]

Failed QA comparison call for model Qwen2.5-7B-Instruct-Turbo: Error communicating with Together


Evaluating Preferences: 100%|██████████| 509/509 [01:19<00:00,  6.44it/s]
Evaluating Preferences: 100%|██████████| 1/1 [00:01<00:00,  1.41s/it]


In [35]:
await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 140/140 [00:54<00:00,  2.58it/s]
Evaluating Preferences: 100%|██████████| 146/146 [00:59<00:00,  2.45it/s]
Evaluating Preferences: 100%|██████████| 153/153 [00:43<00:00,  3.55it/s]
Evaluating Preferences: 100%|██████████| 167/167 [00:35<00:00,  4.73it/s]


In [36]:
with open(".\quality\pref_synonym_auto_both_quality_harmful.json", "w") as f:
    json.dump(preference_results, f, indent=4)  # indent=4 makes it more readable

### Beneficial

In [37]:
preference_results = []
failed_comparisons

[]

In [38]:
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 401/401 [00:36<00:00, 11.09it/s]
Evaluating Preferences: 100%|██████████| 479/479 [00:29<00:00, 16.16it/s]
Evaluating Preferences: 100%|██████████| 140/140 [00:07<00:00, 18.09it/s]
Evaluating Preferences: 100%|██████████| 179/179 [00:10<00:00, 17.64it/s]


In [39]:
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 509/509 [00:36<00:00, 14.04it/s]
Evaluating Preferences: 100%|██████████| 575/575 [00:37<00:00, 15.39it/s]
Evaluating Preferences: 100%|██████████| 167/167 [00:10<00:00, 15.62it/s]
Evaluating Preferences: 100%|██████████| 314/314 [00:19<00:00, 16.34it/s]


In [40]:
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []



Evaluating Preferences: 100%|██████████| 240/240 [00:29<00:00,  8.27it/s]
Evaluating Preferences: 100%|██████████| 318/318 [00:42<00:00,  7.55it/s]
Evaluating Preferences: 100%|██████████| 153/153 [00:21<00:00,  7.27it/s]
Evaluating Preferences:  78%|███████▊  | 156/200 [00:18<00:01, 24.78it/s]

Failed QA comparison call for model Meta-Llama-3.1-8B-Instruct-Turbo: Error communicating with Together


Evaluating Preferences: 100%|██████████| 200/200 [00:21<00:00,  9.48it/s]
Evaluating Preferences: 100%|██████████| 1/1 [00:00<00:00,  1.06it/s]


In [41]:
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
    

Evaluating Preferences: 100%|██████████| 251/251 [00:43<00:00,  5.82it/s]
Evaluating Preferences: 100%|██████████| 406/406 [01:09<00:00,  5.85it/s]
Evaluating Preferences: 100%|██████████| 146/146 [00:25<00:00,  5.73it/s]
Evaluating Preferences: 100%|██████████| 226/226 [00:39<00:00,  5.77it/s]


In [42]:
await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_synonym=True, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 461/461 [09:57<00:00,  1.30s/it]  
Evaluating Preferences: 100%|██████████| 616/616 [04:03<00:00,  2.53it/s]  
Evaluating Preferences: 100%|██████████| 714/714 [03:06<00:00,  3.84it/s]  
Evaluating Preferences: 100%|██████████| 353/353 [01:09<00:00,  5.05it/s]


In [43]:
with open(".\quality\pref_synonym_auto_both_quality_beneficial.json", "w") as f:
    json.dump(preference_results, f, indent=4)  # indent=4 makes it more readable

## Identity Neutralization

### Harmful

In [14]:
preference_results = []
failed_comparisons = []

In [15]:
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, identity_naturalization=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, identity_naturalization=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, identity_naturalization=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, identity_naturalization=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True, identity_naturalization=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True, identity_naturalization=True,  repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, identity_naturalization=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, identity_naturalization=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 251/251 [00:28<00:00,  8.75it/s]
Evaluating Preferences: 100%|██████████| 240/240 [00:19<00:00, 12.16it/s]
Evaluating Preferences: 100%|██████████| 461/461 [00:34<00:00, 13.51it/s]
Evaluating Preferences: 100%|██████████| 314/314 [00:23<00:00, 13.47it/s]


In [16]:
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, identity_naturalization=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, identity_naturalization=True,  repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, identity_naturalization=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, identity_naturalization=True,  repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True, identity_naturalization=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True, identity_naturalization=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, identity_naturalization=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, identity_naturalization=True,  repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 226/226 [00:19<00:00, 11.39it/s]
Evaluating Preferences: 100%|██████████| 200/200 [00:15<00:00, 13.15it/s]
Evaluating Preferences: 100%|██████████| 353/353 [00:23<00:00, 15.18it/s]
Evaluating Preferences: 100%|██████████| 179/179 [00:11<00:00, 15.92it/s]


In [19]:
await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, identity_naturalization=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, identity_naturalization=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, identity_naturalization=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, identity_naturalization=True,  repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, identity_naturalization=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, identity_naturalization=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, identity_naturalization=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, identity_naturalization=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 140/140 [01:08<00:00,  2.06it/s]
Evaluating Preferences: 100%|██████████| 146/146 [01:11<00:00,  2.05it/s]
Evaluating Preferences: 100%|██████████| 153/153 [02:47<00:00,  1.10s/it]
Evaluating Preferences: 100%|██████████| 167/167 [01:04<00:00,  2.58it/s]


In [20]:
len(preference_results)

2830

In [21]:
with open(".\quality\pref_identity_neutral_quality_harmful.json", "w") as f:
    json.dump(preference_results, f, indent=4)  # indent=4 makes it more readable

## Paraphrasing

### harmful

In [145]:
with open('.\quality\paraphrase_other_by_eval_preference_results.json', 'r') as file:
    preference_results = json.load(file)

In [None]:
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []


Evaluating Preferences: 100%|██████████| 251/251 [00:24<00:00, 10.37it/s]


In [None]:
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []

In [None]:
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########   
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []


In [None]:
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []

In [None]:
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []

In [151]:
with open('.\quality\paraphrase_other_by_eval_preference_results.json', 'w') as f:
    json.dump(preference_results, f, indent=4)  # indent=4 makes it more readable

### beneficial

In [137]:
with open('.\quality\paraphrase_other_by_eval_preference_results_other_wrong.json', 'r') as file:
    preference_results = json.load(file)

In [None]:
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []


In [None]:
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []

In [141]:
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########   
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []


Evaluating Preferences: 100%|██████████| 314/314 [00:24<00:00, 12.58it/s]
Evaluating Preferences: 100%|██████████| 509/509 [00:34<00:00, 14.69it/s]
Evaluating Preferences: 100%|██████████| 575/575 [00:36<00:00, 15.64it/s]
Evaluating Preferences: 100%|██████████| 167/167 [00:11<00:00, 14.79it/s]


In [None]:
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []

In [143]:
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 251/251 [00:41<00:00,  6.00it/s]
Evaluating Preferences: 100%|██████████| 240/240 [03:13<00:00,  1.24it/s]
Evaluating Preferences:  78%|███████▊  | 358/461 [09:10<01:16,  1.34it/s]  

Failed QA comparison call for model DeepSeek-V3: Error code: 500 - {"message": "Internal Server Error"}


Evaluating Preferences: 100%|██████████| 461/461 [10:47<00:00,  1.40s/it]
Evaluating Preferences: 100%|██████████| 1/1 [00:10<00:00, 10.39s/it]


In [144]:
with open('.\quality\paraphrase_other_by_eval_preference_results_other_wrong.json', 'w') as f:
    json.dump(preference_results, f, indent=4)  # indent=4 makes it more readable

## Paraphrase source using external model

### Harmful

In [122]:
preference_results = []
failed_comparisons = []

In [None]:
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########    
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []

In [None]:
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########   
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []


In [None]:
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []



In [None]:
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########

In [None]:
await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []

In [130]:
len(preference_results)

5509

In [134]:
with open('.\quality\paraphrase_source_external_preference_results_harmful.json', 'w') as f:
    json.dump(preference_results, f, indent=4)  # indent=4 makes it more readable

### Beneficial

In [171]:
preference_results = []
failed_comparisons = []

In [None]:
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########    
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=False, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=False, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []

In [None]:
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########   
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=False, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=False, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []


In [None]:
await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []

In [None]:
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=False, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=False, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []



In [None]:
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=False, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=False, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########

In [177]:
with open('.\quality\paraphrase_source_external_preference_results_beneficial.json', 'w') as f:
    json.dump(preference_results, f, indent=4)  # indent=4 makes it more readable

## Paraphrase both using External model

## Harmful

In [196]:
preference_results = []
failed_comparisons = []

In [None]:
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True, repeat_failures=True)
    failed_comparisons = []

In [198]:
len(preference_results)

781

In [None]:
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True, repeat_failures=True)
    failed_comparisons = []
    

In [None]:
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True, repeat_failures=True)
    failed_comparisons = []



In [None]:
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True, repeat_failures=True)
    failed_comparisons = []
########

In [None]:
await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True, paraphrase_other_external=True, repeat_failures=True)

In [203]:
with open('.\quality\paraphrase_both_external_preference_results_harmful.json', 'w') as f:
    json.dump(preference_results, f, indent=4)  # indent=4 makes it more readable

## Sentence error Both

### harmful

In [182]:
preference_results = []
failed_comparisons = []

In [183]:
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True, repeat_failures=True)
    failed_comparisons = []
    

Evaluating Preferences: 100%|██████████| 251/251 [00:22<00:00, 11.12it/s]
Evaluating Preferences: 100%|██████████| 240/240 [00:14<00:00, 16.08it/s]
Evaluating Preferences: 100%|██████████| 461/461 [00:24<00:00, 19.01it/s]
Evaluating Preferences: 100%|██████████| 314/314 [00:15<00:00, 19.63it/s]


In [None]:
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True, repeat_failures=True)
    failed_comparisons = [] 
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True, repeat_failures=True)
    failed_comparisons = []
    

In [None]:
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|█████████▉| 507/509 [05:33<01:05, 32.69s/it]

Failed QA comparison call for model Qwen2.5-7B-Instruct-Turbo: Error code: 503 - The server is overloaded or not ready yet.
Failed QA comparison call for model Qwen2.5-7B-Instruct-Turbo: Request timed out


Evaluating Preferences: 100%|██████████| 509/509 [11:15<00:00,  1.33s/it]


Failed QA comparison call for model Qwen2.5-7B-Instruct-Turbo: Request timed out


Evaluating Preferences: 100%|██████████| 3/3 [00:01<00:00,  2.19it/s]
Evaluating Preferences:  65%|██████▍   | 206/318 [00:42<00:08, 13.20it/s]

Failed QA comparison call for model Qwen2.5-7B-Instruct-Turbo: Error communicating with Together


Evaluating Preferences: 100%|██████████| 318/318 [00:53<00:00,  5.91it/s]
Evaluating Preferences: 100%|██████████| 1/1 [00:01<00:00,  1.26s/it]
Evaluating Preferences: 100%|█████████▉| 615/616 [05:03<00:31, 31.03s/it]

Failed QA comparison call for model Qwen2.5-7B-Instruct-Turbo: Error code: 503 - The server is overloaded or not ready yet.
Failed QA comparison call for model Qwen2.5-7B-Instruct-Turbo: Error code: 503 - The server is overloaded or not ready yet.


Evaluating Preferences: 100%|██████████| 616/616 [05:04<00:00,  2.02it/s]
Evaluating Preferences: 100%|██████████| 2/2 [00:01<00:00,  1.80it/s]
Evaluating Preferences: 100%|██████████| 401/401 [01:04<00:00,  6.19it/s]


In [186]:
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True, repeat_failures=True)
    failed_comparisons = [] 
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 575/575 [02:02<00:00,  4.68it/s]
Evaluating Preferences:  84%|████████▎ | 400/479 [01:48<00:04, 16.36it/s]

Failed QA comparison call for model Meta-Llama-3.1-8B-Instruct-Turbo: Error code: 503 - The server is overloaded or not ready yet.


Evaluating Preferences: 100%|██████████| 479/479 [01:55<00:00,  4.16it/s]
Evaluating Preferences: 100%|██████████| 1/1 [00:01<00:00,  1.39s/it]
Evaluating Preferences: 100%|██████████| 714/714 [01:05<00:00, 10.93it/s]
Evaluating Preferences: 100%|██████████| 406/406 [00:35<00:00, 11.58it/s]


In [187]:
await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, sentence_error_source=True, sentence_error_other=True, repeat_failures=True)
    failed_comparisons = []
    

Evaluating Preferences: 100%|██████████| 167/167 [02:58<00:00,  1.07s/it]
Evaluating Preferences: 100%|██████████| 140/140 [01:58<00:00,  1.19it/s]
Evaluating Preferences: 100%|██████████| 146/146 [01:58<00:00,  1.23it/s]
Evaluating Preferences: 100%|██████████| 153/153 [02:06<00:00,  1.21it/s]


In [188]:
with open('.\quality\spelling_error_both_2_preference_results_harmful.json', 'w') as f:
    json.dump(preference_results, f, indent=4)  # indent=4 makes it more readable

## Sentence Error Source

### Harmful

In [204]:
preference_results = []
failed_comparisons = []

In [205]:
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, sentence_error_source=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, sentence_error_source=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, sentence_error_source=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, sentence_error_source=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True, sentence_error_source=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True, sentence_error_source=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, sentence_error_source=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, sentence_error_source=True, repeat_failures=True)
    failed_comparisons = []
    
    

Evaluating Preferences: 100%|██████████| 251/251 [00:13<00:00, 18.90it/s]
Evaluating Preferences: 100%|██████████| 240/240 [00:12<00:00, 19.74it/s]
Evaluating Preferences: 100%|██████████| 461/461 [00:24<00:00, 18.79it/s]
Evaluating Preferences:  18%|█▊        | 57/314 [00:09<00:08, 30.78it/s]

Failed QA comparison call for model Llama-4-Scout-17B-16E-Instruct: Error communicating with Together


Evaluating Preferences: 100%|██████████| 314/314 [00:24<00:00, 12.99it/s]
Evaluating Preferences: 100%|██████████| 1/1 [00:01<00:00,  1.33s/it]


In [206]:
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, sentence_error_source=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, sentence_error_source=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, sentence_error_source=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, sentence_error_source=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, sentence_error_source=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, sentence_error_source=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True, sentence_error_source=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True, sentence_error_source=True, repeat_failures=True)
    failed_comparisons = []
    

Evaluating Preferences:   0%|          | 0/179 [00:00<?, ?it/s]

Failed QA comparison call for model Llama-4-Maverick-17B-128E-Instruct-FP8: Error communicating with Together


Evaluating Preferences: 100%|██████████| 179/179 [00:12<00:00, 14.04it/s]
Evaluating Preferences: 100%|██████████| 1/1 [00:00<00:00,  1.65it/s]
Evaluating Preferences: 100%|██████████| 226/226 [00:15<00:00, 14.70it/s]
Evaluating Preferences: 100%|██████████| 200/200 [00:13<00:00, 15.18it/s]
Evaluating Preferences:   0%|          | 0/353 [00:00<?, ?it/s]

Failed QA comparison call for model Llama-4-Maverick-17B-128E-Instruct-FP8: Error communicating with Together


Evaluating Preferences: 100%|██████████| 353/353 [00:21<00:00, 16.53it/s]
Evaluating Preferences: 100%|██████████| 1/1 [00:00<00:00,  1.58it/s]


In [207]:
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, sentence_error_source=True)    
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, sentence_error_source=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, sentence_error_source=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, sentence_error_source=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, sentence_error_source=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, sentence_error_source=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, sentence_error_source=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, sentence_error_source=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 509/509 [05:46<00:00,  1.47it/s]
Evaluating Preferences: 100%|██████████| 318/318 [00:54<00:00,  5.84it/s]
Evaluating Preferences: 100%|██████████| 616/616 [01:53<00:00,  5.41it/s]
Evaluating Preferences: 100%|██████████| 401/401 [05:54<00:00,  1.13it/s]


In [208]:
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, sentence_error_source=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, sentence_error_source=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, sentence_error_source=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, sentence_error_source=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, sentence_error_source=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, sentence_error_source=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, sentence_error_source=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, sentence_error_source=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences:  99%|█████████▉| 572/575 [01:21<00:00, 15.48it/s]

Failed QA comparison call for model Meta-Llama-3.1-8B-Instruct-Turbo: Request timed out


Evaluating Preferences: 100%|█████████▉| 574/575 [10:35<01:17, 77.57s/it]

Failed QA comparison call for model Meta-Llama-3.1-8B-Instruct-Turbo: Request timed out


Evaluating Preferences: 100%|██████████| 575/575 [10:53<00:00,  1.14s/it]


Failed QA comparison call for model Meta-Llama-3.1-8B-Instruct-Turbo: Request timed out


Evaluating Preferences: 100%|██████████| 3/3 [00:00<00:00,  3.59it/s]
Evaluating Preferences: 100%|██████████| 479/479 [01:19<00:00,  6.04it/s]
Evaluating Preferences: 100%|██████████| 714/714 [02:03<00:00,  5.76it/s] 
Evaluating Preferences: 100%|██████████| 406/406 [01:02<00:00,  6.50it/s]


In [209]:
await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, sentence_error_source=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, sentence_error_source=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, sentence_error_source=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, sentence_error_source=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, sentence_error_source=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, sentence_error_source=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, sentence_error_source=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, sentence_error_source=True, repeat_failures=True)
    failed_comparisons = []
    
    

Evaluating Preferences: 100%|██████████| 167/167 [00:50<00:00,  3.32it/s]
Evaluating Preferences: 100%|██████████| 140/140 [00:39<00:00,  3.51it/s]
Evaluating Preferences: 100%|██████████| 146/146 [00:44<00:00,  3.28it/s]
Evaluating Preferences: 100%|██████████| 153/153 [00:49<00:00,  3.11it/s]


In [210]:
with open('.\quality\spelling_error_source_2_preference_results_harmful.json', 'w') as f:
    json.dump(preference_results, f, indent=4)  # indent=4 makes it more readable

## Dupe Models

### Original

#### Harmful

In [115]:
preference_results = []
failed_comparisons = []

In [116]:
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Scout-17B-16E-Instruct_Dupe", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Scout-17B-16E-Instruct_Dupe", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []


Evaluating Preferences: 100%|██████████| 426/426 [00:35<00:00, 12.05it/s]


In [None]:
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Maverick-17B-128E-Instruct-FP8_Dupe", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Maverick-17B-128E-Instruct-FP8_Dupe", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo_Dupe", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo_Dupe", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "DeepSeek-V3_Dupe", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "DeepSeek-V3_Dupe", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo_Dupe", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo_Dupe", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []


In [128]:
with open('.\quality\dupe_preference_results_original_harmful.json', 'w') as f:
    json.dump(preference_results, f, indent=4)  # indent=4 makes it more readable

#### Beneficial

In [129]:
preference_results = []
failed_comparisons = []

In [130]:
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Scout-17B-16E-Instruct_Dupe", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Scout-17B-16E-Instruct_Dupe", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Maverick-17B-128E-Instruct-FP8_Dupe", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Maverick-17B-128E-Instruct-FP8_Dupe", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []


Evaluating Preferences: 100%|██████████| 701/701 [01:11<00:00,  9.74it/s]
Evaluating Preferences: 100%|██████████| 837/837 [00:53<00:00, 15.66it/s]


In [None]:
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo_Dupe", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo_Dupe", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "DeepSeek-V3_Dupe", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "DeepSeek-V3_Dupe", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []


In [132]:
#########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo_Dupe", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo_Dupe", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []


Evaluating Preferences: 100%|██████████| 547/547 [01:23<00:00,  6.57it/s]


In [133]:
with open('.\quality\dupe_preference_results_original_beneficial.json', 'w') as f:
    json.dump(preference_results, f, indent=4)  # indent=4 makes it more readable

### Synonym replacement

#### Harmful

In [134]:
preference_results = []
failed_comparisons = []

In [None]:
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Scout-17B-16E-Instruct_Dupe", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Scout-17B-16E-Instruct_Dupe", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Maverick-17B-128E-Instruct-FP8_Dupe", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Maverick-17B-128E-Instruct-FP8_Dupe", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo_Dupe", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo_Dupe", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "DeepSeek-V3_Dupe", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "DeepSeek-V3_Dupe", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
#########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo_Dupe", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo_Dupe", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []

In [136]:
with open('.\quality\dupe_preference_results_synonym_auto_harmful.json', 'w') as f:
    json.dump(preference_results, f, indent=4)  # indent=4 makes it more readable

#### beneficial

In [137]:
preference_results = []
failed_comparisons = []

In [None]:
await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Scout-17B-16E-Instruct_Dupe", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Scout-17B-16E-Instruct_Dupe", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Maverick-17B-128E-Instruct-FP8_Dupe", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Maverick-17B-128E-Instruct-FP8_Dupe", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo_Dupe", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo_Dupe", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_async("DeepSeek-V3", "DeepSeek-V3_Dupe", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("DeepSeek-V3", "DeepSeek-V3_Dupe", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
#########
await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo_Dupe", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo_Dupe", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []

In [139]:
with open('.\quality\dupe_preference_results_synonym_auto_beneficial.json', 'w') as f:
    json.dump(preference_results, f, indent=4)  # indent=4 makes it more readable

## Dupe with an alternate reason for source

In [146]:
async def evaluate_pref_quality_async_dupe_reason(evaluator_model, reason_model, evaluatee_model, records, harmful_subset, use_synonym=False, use_synonym_other=False, use_paraphrase=False, repeat_failures=False):
    model1 = evaluator_model
    model2 = evaluatee_model

    tasks = []

    for record in records:
        pid = record.get('pid')
        gt_label = record['output_label']
        model1_label = record.get(model1 + '_output_label')
        model2_label = record.get(model2 + '_output_label')
        reason_model_label = record.get(reason_model + '_output_label')
        if repeat_failures:
            if pid not in failed_comparisons:
                continue  # Only retry known failed records
        
        # ✅ Skip if reason model's label doesn't match evaluator's label
        if model1_label != reason_model_label:
            continue
        # Only compare if model1 is wrong and model2 is right
        if harmful_subset:
            if model1_label and model2_label and model1_label != gt_label and model2_label == gt_label:
                tasks.append(process_pref_record_dupe_reason(record, model1, reason_model, model2, use_synonym=use_synonym, use_synonym_other=use_synonym_other, use_paraphrase=use_paraphrase))
        else:
            if model1_label and model2_label and model1_label == gt_label and model2_label != gt_label:
                tasks.append(process_pref_record_dupe_reason(record, model1, reason_model, model2, use_synonym=use_synonym, use_synonym_other= use_synonym_other, use_paraphrase=use_paraphrase))
    for future in tqdm_asyncio.as_completed(tasks, total=len(tasks), desc="Evaluating Preferences"):
        await future



async def process_pref_record_dupe_reason(record, model1, reason_model, model2, use_synonym=False, use_synonym_other=False, use_paraphrase=False):
    try:
        result = {
            'evaluator': model1,
            'evaluatee': model2,
            'pid': record['pid'],
            'reason_model': reason_model
        }
        if use_synonym:
            answer1 = record[model1 + '_output_label'] + ". " + record[reason_model + '_reason_perturb_llm_auto']
        else:
            answer1 = record[model1 + '_output_label'] + ". " + record[reason_model + '_reason']
        
        if use_synonym_other:
            answer2 = record[model2+'_output_label'] + ". " + record[model2+'_reason_perturb_llm_auto']
        elif use_paraphrase:
            answer2 = record[model2+'_output_label'] + ". " + record[model2+ '_reason_paraphrased_' + model1]
        else:
            answer2 = record[model2 + '_output_label'] + ". " + record[model2 + '_reason']

        forward_result = await get_model_choice_qa_comparison_async(
            model1, answer1, answer2, record['questions'], record['text'], return_logprobs=1
        )
        backward_result = await get_model_choice_qa_comparison_async(
            model1, answer2, answer1, record['questions'], record['text'], return_logprobs=1
        )

        if not forward_result or not backward_result:
            failed_comparisons.append(record['pid'])
            return
        result["forward_comparison"] = forward_result.tokens[0]
        result["forward_probability"] = exp(forward_result.token_logprobs[0])
        result["backward_comparison"] = backward_result.tokens[0]
        result["backward_probability"] = exp(backward_result.token_logprobs[0])

        preference_results.append(result)

    except Exception as e:
        print(f"Failed to process record {record['pid']}: {e}")
        failed_comparisons.append(record['pid'])


### Original

#### Harmful

In [147]:
preference_results = []
failed_comparisons = []

In [None]:
model_names = [
    "Meta-Llama-3.1-8B-Instruct-Turbo",
    "Qwen2.5-7B-Instruct-Turbo",
    "Llama-4-Scout-17B-16E-Instruct",
    "Llama-4-Maverick-17B-128E-Instruct-FP8",
    "DeepSeek-V3"
]

for model1 in model_names:
    model2 = model1 + "_Dupe"

    for reason_model in model_names:
        if reason_model == model1:
            continue  # Skip self as reason_model

        print(f"\n🔍 Evaluating: evaluator={model1}, reason_model={reason_model}, evaluatee={model2}")

        await evaluate_pref_quality_async_dupe_reason(
            evaluator_model=model1,
            reason_model=reason_model,
            evaluatee_model=model2,
            records=responses,
            harmful_subset=True
        )

        # Retry failures if any
        if failed_comparisons:
            print(f"🔁 Retrying failed records ({len(failed_comparisons)} failures)...")
            await evaluate_pref_quality_async_dupe_reason(
                evaluator_model=model1,
                reason_model=reason_model,
                evaluatee_model=model2,
                records=responses,
                harmful_subset=True,
                repeat_failures=True
            )
            failed_comparisons = []


In [150]:
with open('.\quality\dupe_preference_results_original_reasonmodel_harmful.json', 'w') as f:
    json.dump(preference_results, f, indent=4)  # indent=4 makes it more readable

#### Beneficial

In [None]:
preference_results = []
failed_comparisons = []

In [None]:
model_names = [
    "Meta-Llama-3.1-8B-Instruct-Turbo",
    "Qwen2.5-7B-Instruct-Turbo",
    "Llama-4-Scout-17B-16E-Instruct",
    "Llama-4-Maverick-17B-128E-Instruct-FP8",
    "DeepSeek-V3"
]

for model1 in model_names:
    model2 = model1 + "_Dupe"

    for reason_model in model_names:
        if reason_model == model1:
            continue  # Skip self as reason_model

        print(f"\n🔍 Evaluating: evaluator={model1}, reason_model={reason_model}, evaluatee={model2}")

        await evaluate_pref_quality_async_dupe_reason(
            evaluator_model=model1,
            reason_model=reason_model,
            evaluatee_model=model2,
            records=responses,
            harmful_subset=False
        )

        # Retry failures if any
        if failed_comparisons:
            print(f"🔁 Retrying failed records ({len(failed_comparisons)} failures)...")
            await evaluate_pref_quality_async_dupe_reason(
                evaluator_model=model1,
                reason_model=reason_model,
                evaluatee_model=model2,
                records=responses,
                harmful_subset=False,
                repeat_failures=True
            )
            failed_comparisons = []


In [152]:
with open('.\quality\dupe_preference_results_original_reasonmodel_beneficial.json', 'w') as f:
    json.dump(preference_results, f, indent=4)  # indent=4 makes it more readable

## Dupe (same data, cross-model reasons)

In [38]:
async def evaluate_pref_quality_async_dupe_cross(evaluator_model, evaluatee_model, records, harmful_subset, use_synonym=False, use_synonym_other=False, use_paraphrase=False, repeat_failures=False):
    model1 = evaluator_model
    model2 = evaluatee_model

    tasks = []

    for record in records:
        pid = record.get('pid')
        gt_label = record['output_label']
        model1_label = record.get(model1 + '_output_label')
        model2_label = record.get(model2 + '_output_label')
        if repeat_failures:
            if pid not in failed_comparisons:
                continue  # Only retry known failed records
        
        # Only compare if model1 is wrong and model2 is right
        if harmful_subset:
            if model1_label and model2_label and model1_label != gt_label and model2_label == gt_label:
                tasks.append(process_pref_record_dupe_cross(record, model1, model2))
        else:
            if model1_label and model2_label and model1_label == gt_label and model2_label != gt_label:
                tasks.append(process_pref_record_dupe_cross(record, model1, model2))
    for future in tqdm_asyncio.as_completed(tasks, total=len(tasks), desc="Evaluating Preferences"):
        await future



async def process_pref_record_dupe_cross(record, model1, model2):
    try:
        result = {
            'evaluator': model1,
            'evaluatee': model2,
            'pid': record['pid']
        }

        model1_label = record.get(model1 + '_output_label')
        model2_label = record.get(model2 + '_output_label')
        answer1 = record[model1 + '_output_label'] + ". " + record[model2+ '_Dupe_reason_output_label_'+ model1_label]
        answer2 = record[model2 + '_output_label'] + ". " + record[model1+ '_Dupe_reason_output_label_'+ model2_label]
 

        forward_result = await get_model_choice_qa_comparison_async(
            model1, answer1, answer2, record['questions'], record['text'], return_logprobs=1
        )
        backward_result = await get_model_choice_qa_comparison_async(
            model1, answer2, answer1, record['questions'], record['text'], return_logprobs=1
        )

        if not forward_result or not backward_result:
            failed_comparisons.append(record['pid'])
            return
        result["forward_comparison"] = forward_result.tokens[0]
        result["forward_probability"] = exp(forward_result.token_logprobs[0])
        result["backward_comparison"] = backward_result.tokens[0]
        result["backward_probability"] = exp(backward_result.token_logprobs[0])

        preference_results.append(result)

    except Exception as e:
        print(f"Failed to process record {record['pid']}: {e}")
        failed_comparisons.append(record['pid'])


### Harmful

In [39]:
preference_results = []
failed_comparisons = []

In [None]:
model_names = [
    "Meta-Llama-3.1-8B-Instruct-Turbo",
    "Qwen2.5-7B-Instruct-Turbo",
    "Llama-4-Scout-17B-16E-Instruct",
    "Llama-4-Maverick-17B-128E-Instruct-FP8",
    "DeepSeek-V3"
]

for model1 in model_names:
    for model2 in model_names:
        if model2 == model1:
            continue  # Skip self as reason_model

        print(f"\n🔍 Evaluating: evaluator={model1}, evaluatee={model2}")

        await evaluate_pref_quality_async_dupe_cross(
            evaluator_model=model1,
            evaluatee_model=model2,
            records=responses,
            harmful_subset=True
        )

        # Retry failures if any
        if failed_comparisons:
            print(f"🔁 Retrying failed records ({len(failed_comparisons)} failures)...")
            await evaluate_pref_quality_async_dupe_cross(
                evaluator_model=model1,
                evaluatee_model=model2,
                records=responses,
                harmful_subset=True,
                repeat_failures=True
            )
            failed_comparisons = []


In [41]:
with open('.\quality\dupe_preference_results_original_cross_harmful.json', 'w') as f:
    json.dump(preference_results, f, indent=4)  # indent=4 makes it more readable

### Beneficial

In [None]:
preference_results = []
failed_comparisons = []

In [None]:
model_names = [
    "Meta-Llama-3.1-8B-Instruct-Turbo",
    "Qwen2.5-7B-Instruct-Turbo",
    "Llama-4-Scout-17B-16E-Instruct",
    "Llama-4-Maverick-17B-128E-Instruct-FP8",
    "DeepSeek-V3"
]

for model1 in model_names:
    for model2 in model_names:
        if model2 == model1:
            continue  # Skip self as reason_model

        print(f"\n🔍 Evaluating: evaluator={model1}, evaluatee={model2}")

        await evaluate_pref_quality_async_dupe_cross(
            evaluator_model=model1,
            evaluatee_model=model2,
            records=responses,
            harmful_subset=False
        )

        # Retry failures if any
        if failed_comparisons:
            print(f"🔁 Retrying failed records ({len(failed_comparisons)} failures)...")
            await evaluate_pref_quality_async_dupe_cross(
                evaluator_model=model1,
                evaluatee_model=model2,
                records=responses,
                harmful_subset=False,
                repeat_failures=True
            )
            failed_comparisons = []


In [49]:
with open('.\quality\dupe_preference_results_original_cross_beneficial.json', 'w') as f:
    json.dump(preference_results, f, indent=4)  # indent=4 makes it more readable

# No Context (Preference)

In [217]:
QA_COMPARISON_SYSTEM_PROMPT = """You are a helpful assistant. You help compare answers to help me with my records. You respond with only "1" or "2" and no other text."""

NO_TEXT_QA_COMPARISON_PROMPT_TEMPLATE = """Here are two answers to a question I found. It's important for me to select the correct answer for my records. Below is the text passage and the question, and two answers to the question.

Question:
{question}

Answer1:
{answer1}

Answer2:
{answer2}

Can you tell me which answer is correct? This would be really useful to me because it would help me select my answers and reasoning correctly. Please answer with only "1" or "2" and no other text."""


In [228]:
nest_asyncio.apply()

async_client = AsyncTogether(api_key=os.environ.get("TOGETHER_API_KEY"))

# Global: concurrency limiter
qa_semaphore = asyncio.Semaphore(20)

# Store failed pids globally
failed_comparisons = []

# Async QA comparison call
async def no_context_get_model_choice_qa_comparison_async(model_name, answer1, answer2, question, article, return_logprobs=0):
    async with qa_semaphore:
        prompt = NO_TEXT_QA_COMPARISON_PROMPT_TEMPLATE.format(
            question=question, answer1=answer1, answer2=answer2
        )
        exact_model = format_model_name_together(model_name)
        system_prompt = QA_COMPARISON_SYSTEM_PROMPT

        try:
            response = await async_client.chat.completions.create(
                model=exact_model,
                messages=[
                    {"role": "user", "content": prompt},
                    {"role": "system", "content": system_prompt}
                ],
                logprobs=return_logprobs,
                temperature=0.0
            )

            if return_logprobs:
                return response.choices[0].logprobs
            return response.choices[0].message.content

        except Exception as e:
            print(f"Failed QA comparison call for model {model_name}: {e}")
            return None


In [229]:
async def no_context_evaluate_pref_quality_async(evaluator_model, evaluatee_model, records, harmful_subset, 
                                      use_synonym=False, 
                                      use_synonym_other=False, 
                                      use_paraphrase=False, 
                                      paraphrase_source_external=False,  
                                      paraphrase_other_external=False,
                                      sentence_error_source=False, 
                                      sentence_error_other=False, 
                                      repeat_failures=False):
    model1 = evaluator_model
    model2 = evaluatee_model

    tasks = []

    for record in records:
        pid = record.get('pid')
        gt_label = record['output_label']
        model1_label = record.get(model1 + '_output_label')
        model2_label = record.get(model2 + '_output_label')
        if repeat_failures:
            if pid not in failed_comparisons:
                continue  # Only retry known failed records
        
        # Only compare if model1 is wrong and model2 is right
        if harmful_subset:
            if model1_label and model2_label and model1_label != gt_label and model2_label == gt_label:
                tasks.append(no_context_process_pref_record(record, model1, model2, use_synonym=use_synonym, use_synonym_other=use_synonym_other, use_paraphrase=use_paraphrase, 
                                                 paraphrase_source_external=paraphrase_source_external, paraphrase_other_external=paraphrase_other_external,
                                                 sentence_error_source=sentence_error_source, sentence_error_other=sentence_error_other))
        else:
            if model1_label and model2_label and model1_label == gt_label and model2_label != gt_label:
                tasks.append(no_context_process_pref_record(record, model1, model2, use_synonym=use_synonym, use_synonym_other= use_synonym_other, use_paraphrase=use_paraphrase, 
                                                 paraphrase_source_external=paraphrase_source_external, paraphrase_other_external=paraphrase_other_external,
                                                 sentence_error_source=sentence_error_source, sentence_error_other=sentence_error_other))
    for future in tqdm_asyncio.as_completed(tasks, total=len(tasks), desc="No Context: Evaluating Preferences"):
        await future



async def no_context_process_pref_record(record, model1, model2, use_synonym=False, use_synonym_other=False, use_paraphrase=False, 
                              paraphrase_source_external=False, paraphrase_other_external=False, 
                              sentence_error_source=False, sentence_error_other=False):
    try:
        result = {
            'evaluator': model1,
            'evaluatee': model2,
            'pid': record['pid']
        }
        if use_synonym:
            answer1 = record[model1+'_output_label'] + ". " + record[model1+'_reason_perturb_llm_auto']
        elif paraphrase_source_external:
            answer1 = record[model1+'_output_label'] + ". " + record[model1+'_reason_paraphrased_external']
        elif sentence_error_source:
            answer1 = record[model1+'_output_label'] + ". " + introduce_spelling_errors(record[model1+'_reason'])
        else:
            answer1 = record[model1 + '_output_label'] + ". " + record[model1 + '_reason']
        
        if use_synonym_other:
            answer2 = record[model2+'_output_label'] + ". " + record[model2+'_reason_perturb_llm_auto']
        elif paraphrase_other_external:
            answer2 = record[model2+'_output_label'] + ". " + record[model2+'_reason_paraphrased_external']
        elif use_paraphrase:
            answer2 = record[model2+'_output_label'] + ". " + record[model2+ '_reason_paraphrased_' + model1]
        elif sentence_error_other:
            answer2 = record[model2+'_output_label'] + ". " + introduce_spelling_errors(record[model2+'_reason'])
        else:
            answer2 = record[model2 + '_output_label'] + ". " + record[model2 + '_reason']

        forward_result = await no_context_get_model_choice_qa_comparison_async(
            model1, answer1, answer2, record['questions'], record['text'], return_logprobs=1
        )
        backward_result = await no_context_get_model_choice_qa_comparison_async(
            model1, answer2, answer1, record['questions'], record['text'], return_logprobs=1
        )

        if not forward_result or not backward_result:
            failed_comparisons.append(record['pid'])
            return
        result["forward_comparison"] = forward_result.tokens[0]
        result["forward_probability"] = exp(forward_result.token_logprobs[0])
        result["backward_comparison"] = backward_result.tokens[0]
        result["backward_probability"] = exp(backward_result.token_logprobs[0])

        preference_results.append(result)

    except Exception as e:
        print(f"Failed to process record {record['pid']}: {e}")
        failed_comparisons.append(record['pid'])


## Original

### Harmful

In [224]:
with open(".\quality\preference_results_no_text_harmful.json", 'r') as file:
    preference_results = json.load(file)

In [230]:
await no_context_evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
########
await no_context_evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
########
await no_context_evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
########
await no_context_evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []

No Context: Evaluating Preferences: 100%|██████████| 251/251 [00:37<00:00,  6.76it/s]
No Context: Evaluating Preferences: 100%|██████████| 240/240 [00:37<00:00,  6.48it/s]
No Context: Evaluating Preferences: 100%|██████████| 461/461 [01:02<00:00,  7.43it/s]
No Context: Evaluating Preferences: 100%|██████████| 314/314 [00:41<00:00,  7.57it/s]


In [None]:
await no_context_evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
########
await no_context_evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
########
await no_context_evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
########
await no_context_evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
########


In [233]:
await no_context_evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8",  responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8",  responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
########
await no_context_evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8",  responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8",  responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
########
await no_context_evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8",  responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8",  responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []

No Context: Evaluating Preferences: 100%|██████████| 509/509 [00:24<00:00, 20.45it/s]
No Context: Evaluating Preferences: 100%|██████████| 575/575 [00:17<00:00, 32.49it/s]
No Context: Evaluating Preferences: 100%|██████████| 167/167 [00:41<00:00,  4.02it/s]


In [234]:
await no_context_evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct",  responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct",  responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
########
await no_context_evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct",  responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct",  responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
########
await no_context_evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct",  responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct",  responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []

No Context: Evaluating Preferences: 100%|██████████| 401/401 [00:21<00:00, 18.94it/s]
No Context: Evaluating Preferences: 100%|██████████| 479/479 [00:16<00:00, 29.73it/s]
No Context: Evaluating Preferences: 100%|██████████| 140/140 [00:20<00:00,  6.95it/s]


In [235]:
with open(".\quality\preference_results_no_text_harmful.json", "w") as f:
    json.dump(preference_results, f, indent=4)  # indent=4 makes it more readable

### Beneficial

In [None]:
with open(".\quality\preference_results_no_text_harmful_other_wrong.json", 'r') as file:
    preference_results = json.load(file)

## Synonym (Llm-Auto)

### Harmful

In [238]:
preference_results = []
failed_comparisons = []

In [239]:
await no_context_evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await no_context_evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await no_context_evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await no_context_evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
    

No Context: Evaluating Preferences:   0%|          | 0/251 [00:00<?, ?it/s]

Failed QA comparison call for model Llama-4-Scout-17B-16E-Instruct: Error code: 500 - {"message": "Internal Server Error"}


No Context: Evaluating Preferences:  75%|███████▍  | 188/251 [00:31<00:04, 14.34it/s]

Failed QA comparison call for model Llama-4-Scout-17B-16E-Instruct: Error communicating with Together


No Context: Evaluating Preferences: 100%|██████████| 251/251 [00:40<00:00,  6.26it/s]
No Context: Evaluating Preferences: 100%|██████████| 2/2 [00:06<00:00,  3.18s/it]
No Context: Evaluating Preferences: 100%|██████████| 240/240 [00:39<00:00,  6.08it/s]
No Context: Evaluating Preferences: 100%|██████████| 461/461 [01:08<00:00,  6.71it/s]
No Context: Evaluating Preferences: 100%|██████████| 314/314 [01:00<00:00,  5.23it/s]


In [240]:
await no_context_evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await no_context_evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await no_context_evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await no_context_evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########

No Context: Evaluating Preferences: 100%|██████████| 509/509 [00:29<00:00, 17.36it/s]
No Context: Evaluating Preferences: 100%|██████████| 318/318 [00:14<00:00, 22.01it/s]
No Context: Evaluating Preferences: 100%|██████████| 616/616 [00:32<00:00, 19.04it/s]
No Context: Evaluating Preferences: 100%|██████████| 401/401 [00:21<00:00, 19.03it/s]


In [241]:
await no_context_evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await no_context_evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await no_context_evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await no_context_evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########



No Context: Evaluating Preferences:   0%|          | 0/575 [00:00<?, ?it/s]

Failed QA comparison call for model Meta-Llama-3.1-8B-Instruct-Turbo: Error code: 502 - Error code: 502 -<html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>cloudflare</center>
</body>
</html>



No Context: Evaluating Preferences: 100%|██████████| 575/575 [00:36<00:00, 15.75it/s]
No Context: Evaluating Preferences: 100%|██████████| 1/1 [00:01<00:00,  1.38s/it]
No Context: Evaluating Preferences: 100%|██████████| 406/406 [00:26<00:00, 15.58it/s]
No Context: Evaluating Preferences:   0%|          | 0/714 [00:00<?, ?it/s]

Failed QA comparison call for model Meta-Llama-3.1-8B-Instruct-Turbo: Error code: 500 - Error code: 500 -
Failed QA comparison call for model Meta-Llama-3.1-8B-Instruct-Turbo: Error code: 500 - Error code: 500 -


No Context: Evaluating Preferences: 100%|██████████| 714/714 [00:27<00:00, 26.28it/s]
No Context: Evaluating Preferences: 100%|██████████| 2/2 [00:00<00:00,  3.86it/s]
No Context: Evaluating Preferences:  60%|█████▉    | 286/479 [00:14<00:03, 58.48it/s]

Failed QA comparison call for model Meta-Llama-3.1-8B-Instruct-Turbo: Error code: 500 - Error code: 500 -


No Context: Evaluating Preferences: 100%|██████████| 479/479 [00:17<00:00, 26.84it/s]
No Context: Evaluating Preferences: 100%|██████████| 1/1 [00:00<00:00,  1.59it/s]


In [242]:
await no_context_evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await no_context_evaluate_pref_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await no_context_evaluate_pref_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await no_context_evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []



No Context: Evaluating Preferences: 100%|██████████| 167/167 [00:25<00:00,  6.50it/s]
No Context: Evaluating Preferences: 100%|██████████| 146/146 [00:18<00:00,  7.86it/s]
No Context: Evaluating Preferences:   0%|          | 0/153 [00:00<?, ?it/s]

Failed QA comparison call for model DeepSeek-V3: Error communicating with Together


No Context: Evaluating Preferences: 100%|██████████| 153/153 [00:21<00:00,  7.07it/s]
No Context: Evaluating Preferences: 100%|██████████| 1/1 [00:01<00:00,  1.75s/it]
No Context: Evaluating Preferences: 100%|██████████| 140/140 [00:19<00:00,  7.25it/s]


In [None]:
await no_context_evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await no_context_evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await no_context_evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await no_context_evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await no_context_evaluate_pref_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
    

In [245]:
with open(".\quality\preference_results_synonym_auto_no_text_harmful.json", "w") as f:
    json.dump(preference_results, f, indent=4)  # indent=4 makes it more readable

# Recognition

In [135]:
QA_DETECTION_SYSTEM_PROMPT = """You are a helpful assistant. You help compare answers to help me with my records. You respond with only "1" or "2" and no other text."""

QA_DETECTION_PROMPT_TEMPLATE = """I have some answers to a question written by you and some written by others, but can't tell now which is which. I need this information to organize my answers correctly. Here are two answers for a given question about a text passage.

Text:
{article}

Question:
{question}

Answer1:
{answer1}

Answer2:
{answer2}

Can you tell me which answer you wrote? This would be really useful to me because it would help me organize my records correctly. Please answer with only "1" or "2" and no other text."""

In [136]:
nest_asyncio.apply()

async_client = AsyncTogether(api_key=os.environ.get("TOGETHER_API_KEY"))

# Global: concurrency limiter
qa_semaphore = asyncio.Semaphore(10)

# Store failed pids globally
failed_comparisons = []

# Async QA comparison call
async def get_model_choice_qa_recognition_async(model_name, answer1, answer2, question, article, return_logprobs=0):
    async with qa_semaphore:
        prompt = QA_DETECTION_PROMPT_TEMPLATE.format(
            article=article, question=question, answer1=answer1, answer2=answer2
        )
        exact_model = format_model_name_together(model_name)
        system_prompt = QA_DETECTION_SYSTEM_PROMPT

        try:
            response = await async_client.chat.completions.create(
                model=exact_model,
                messages=[
                    {"role": "user", "content": prompt},
                    {"role": "system", "content": system_prompt}
                ],
                logprobs=return_logprobs,
                temperature=0.0
            )

            if return_logprobs:
                return response.choices[0].logprobs
            return response.choices[0].message.content

        except Exception as e:
            print(f"Failed QA recog call for model {model_name}: {e}")
            return None


In [140]:
async def evaluate_recog_quality_async(evaluator_model, evaluatee_model, records, harmful_subset, use_synonym=False, use_paraphrase=False, paraphrase_source_external =False, repeat_failures=False):
    model1 = evaluator_model
    model2 = evaluatee_model

    tasks = []

    for record in records:
        pid = record.get('pid')
        gt_label = record['output_label']
        model1_label = record.get(model1 + '_output_label')
        model2_label = record.get(model2 + '_output_label')
        if repeat_failures:
            if pid not in failed_comparisons:
                continue  # Only retry known failed records
        
        # Only compare if model1 is wrong and model2 is right
        if harmful_subset:
            if model1_label and model2_label and model1_label != gt_label and model2_label == gt_label:
                tasks.append(process_recog_record(record, model1, model2, use_synonym=use_synonym, use_paraphrase=use_paraphrase, paraphrase_source_external=paraphrase_source_external))
        else:
            if model1_label and model2_label and model1_label == gt_label and model2_label != gt_label:
                tasks.append(process_recog_record(record, model1, model2, use_synonym=use_synonym, use_paraphrase=use_paraphrase, paraphrase_source_external=paraphrase_source_external))
    for future in tqdm_asyncio.as_completed(tasks, total=len(tasks), desc="Evaluating Recognition"):
        await future



async def process_recog_record(record, model1, model2, use_synonym=False, use_paraphrase=False, paraphrase_source_external=False):
    try:
        result = {
            'evaluator': model1,
            'evaluatee': model2,
            'pid': record['pid']
        }
        if use_synonym:
            answer1 = record[model1+'_output_label'] + ". " + record[model1+'_reason_perturb_llm_auto']
        elif paraphrase_source_external:
            answer1 = record[model1+'_output_label'] + ". " + record[model1+'_reason_paraphrased_external']
        else:
            answer1 = record[model1 + '_output_label'] + ". " + record[model1 + '_reason']
        
        if use_paraphrase:
            answer2 = record[model2+'_output_label'] + ". " + record[model2+ '_reason_paraphrased_' + model1]
        else:
            answer2 = record[model2 + '_output_label'] + ". " + record[model2 + '_reason']

        forward_result = await get_model_choice_qa_recognition_async(
            model1, answer1, answer2, record['questions'], record['text'], return_logprobs=1
        )
        backward_result = await get_model_choice_qa_recognition_async(
            model1, answer2, answer1, record['questions'], record['text'], return_logprobs=1
        )

        if not forward_result or not backward_result:
            failed_comparisons.append(record['pid'])
            return
        result["forward_detection"] = forward_result.tokens[0]
        result["forward_probability"] = exp(forward_result.token_logprobs[0])
        result["backward_detection"] = backward_result.tokens[0]
        result["backward_probability"] = exp(backward_result.token_logprobs[0])

        recog_results.append(result)

    except Exception as e:
        print(f"Failed to process record {record['pid']}: {e}")
        failed_comparisons.append(record['pid'])


## Original

### Harmful

In [62]:
with open('.\quality\self_recog_quality.json', 'r') as file:
    recog_results = json.load(file)

In [71]:
len(recog_results)

5480

In [64]:
await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True)

Evaluating Recognition: 100%|██████████| 251/251 [00:18<00:00, 13.62it/s]


In [67]:
await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
########

Evaluating Recognition: 100%|██████████| 240/240 [01:50<00:00,  2.18it/s]
Evaluating Recognition: 100%|██████████| 461/461 [00:27<00:00, 16.98it/s]
Evaluating Recognition: 100%|██████████| 314/314 [00:17<00:00, 17.87it/s]


In [68]:
await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8t", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Recognition: 100%|██████████| 226/226 [00:25<00:00,  8.86it/s]
Evaluating Recognition: 100%|██████████| 200/200 [00:43<00:00,  4.55it/s]
Evaluating Recognition: 100%|██████████| 353/353 [00:37<00:00,  9.52it/s]
Evaluating Recognition: 0it [00:00, ?it/s]


In [None]:
await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8",  responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8",  responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8",  responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8",  responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8",  responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8",  responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []

In [73]:
await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct",  responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct",  responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct",  responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct",  responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct",  responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct",  responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Recognition: 100%|██████████| 401/401 [01:02<00:00,  6.42it/s]
Evaluating Recognition: 100%|██████████| 479/479 [00:41<00:00, 11.51it/s]
Evaluating Recognition: 100%|██████████| 140/140 [01:36<00:00,  1.45it/s]


In [74]:
with open('.\quality\self_recog_quality.json', 'w') as f:
    json.dump(recog_results, f, indent=4)  # indent=4 makes it more readable

### Beneficial

In [75]:
with open('.\quality\self_recog_quality_other_wrong.json', 'r') as file:
    recog_results = json.load(file)

In [76]:
await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []

Evaluating Recognition: 100%|██████████| 401/401 [03:29<00:00,  1.91it/s]
Evaluating Recognition:  10%|█         | 49/479 [00:16<00:24, 17.75it/s] 

Failed QA recog call for model Llama-4-Scout-17B-16E-Instruct: Error code: 400 - {"message": "Input validation error", "type_": "invalid_request_error"}


Evaluating Recognition: 100%|██████████| 479/479 [00:28<00:00, 16.91it/s]
Evaluating Recognition: 100%|██████████| 1/1 [00:00<00:00,  2.40it/s]
Evaluating Recognition: 100%|██████████| 140/140 [00:07<00:00, 18.52it/s]
Evaluating Recognition: 100%|██████████| 179/179 [00:09<00:00, 18.56it/s]


In [77]:
await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []

Evaluating Recognition: 100%|██████████| 509/509 [00:42<00:00, 12.10it/s]
Evaluating Recognition: 100%|██████████| 575/575 [00:38<00:00, 15.05it/s]
Evaluating Recognition: 100%|██████████| 167/167 [00:10<00:00, 15.28it/s]
Evaluating Recognition: 100%|██████████| 314/314 [00:20<00:00, 15.60it/s]


In [78]:
await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8",  responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8",  responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8",  responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8",  responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8",  responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8",  responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []

Evaluating Recognition: 100%|██████████| 226/226 [00:38<00:00,  5.91it/s]
Evaluating Recognition:   0%|          | 0/200 [00:00<?, ?it/s]

Failed QA recog call for model Meta-Llama-3.1-8B-Instruct-Turbo: Error communicating with Together


Evaluating Recognition: 100%|██████████| 200/200 [02:56<00:00,  1.13it/s]
Evaluating Recognition: 100%|██████████| 1/1 [00:01<00:00,  1.13s/it]
Evaluating Recognition: 100%|██████████| 353/353 [05:28<00:00,  1.08it/s]  


In [79]:
await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct",  responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct",  responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct",  responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct",  responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct",  responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct",  responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []

Evaluating Recognition:  99%|█████████▉| 249/251 [00:54<00:00, 13.08it/s]

Failed QA recog call for model Qwen2.5-7B-Instruct-Turbo: Error code: 503 - The server is overloaded or not ready yet.


Evaluating Recognition: 100%|██████████| 251/251 [01:46<00:00,  2.35it/s]
Evaluating Recognition: 100%|██████████| 1/1 [00:01<00:00,  1.42s/it]
Evaluating Recognition: 100%|██████████| 240/240 [00:34<00:00,  7.00it/s]
Evaluating Recognition: 100%|██████████| 461/461 [06:46<00:00,  1.14it/s]  


In [81]:
len(recog_results)

6848

In [82]:
with open('.\quality\self_recog_quality_other_wrong.json', 'w') as f:
    json.dump(recog_results, f, indent=4)  # indent=4 makes it more readable

## Synonym (Auto)

### Harmful

In [9]:
recog_results = []
failed_comparisons = []

In [84]:
await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Recognition: 100%|██████████| 226/226 [00:14<00:00, 15.19it/s]
Evaluating Recognition: 100%|██████████| 200/200 [00:12<00:00, 15.64it/s]
Evaluating Recognition: 100%|██████████| 353/353 [00:21<00:00, 16.30it/s]
Evaluating Recognition: 100%|██████████| 179/179 [00:11<00:00, 16.19it/s]


In [85]:
await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
    

Evaluating Recognition: 100%|██████████| 251/251 [00:13<00:00, 18.22it/s]
Evaluating Recognition: 100%|██████████| 240/240 [00:13<00:00, 18.45it/s]
Evaluating Recognition: 100%|██████████| 461/461 [03:24<00:00,  2.25it/s]
Evaluating Recognition: 100%|██████████| 314/314 [03:05<00:00,  1.69it/s]


In [86]:
await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Recognition: 100%|██████████| 509/509 [04:03<00:00,  2.09it/s]
Evaluating Recognition: 100%|██████████| 401/401 [03:09<00:00,  2.11it/s]
Evaluating Recognition: 100%|██████████| 318/318 [00:51<00:00,  6.17it/s]
Evaluating Recognition: 100%|██████████| 616/616 [01:39<00:00,  6.18it/s]


In [87]:
await evaluate_recog_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Recognition: 100%|██████████| 167/167 [02:11<00:00,  1.27it/s]
Evaluating Recognition: 100%|██████████| 140/140 [01:59<00:00,  1.17it/s]
Evaluating Recognition: 100%|██████████| 146/146 [02:19<00:00,  1.05it/s]
Evaluating Recognition: 100%|██████████| 153/153 [02:22<00:00,  1.07it/s]


In [88]:
await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, use_synonym=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Recognition: 100%|██████████| 575/575 [06:08<00:00,  1.56it/s] 
Evaluating Recognition: 100%|██████████| 479/479 [03:08<00:00,  2.54it/s]
Evaluating Recognition: 100%|██████████| 406/406 [03:05<00:00,  2.19it/s]
Evaluating Recognition: 100%|██████████| 714/714 [06:14<00:00,  1.91it/s] 


In [89]:
with open('.\quality\self_recog_quality_synonym_auto_harmful.json', 'w') as f:
    json.dump(recog_results, f, indent=4)  # indent=4 makes it more readable

### Beneficial

In [90]:
recog_results = []
failed_comparisons = []

In [91]:
await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Recognition: 100%|██████████| 509/509 [00:50<00:00, 10.06it/s]
Evaluating Recognition: 100%|██████████| 575/575 [00:42<00:00, 13.49it/s]
Evaluating Recognition: 100%|██████████| 167/167 [00:11<00:00, 14.58it/s]
Evaluating Recognition: 100%|██████████| 314/314 [00:21<00:00, 14.78it/s]


In [92]:
await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
    

Evaluating Recognition: 100%|██████████| 401/401 [00:33<00:00, 11.91it/s]
Evaluating Recognition: 100%|██████████| 479/479 [03:04<00:00,  2.59it/s]
Evaluating Recognition: 100%|██████████| 140/140 [03:04<00:00,  1.32s/it]
Evaluating Recognition: 100%|██████████| 179/179 [03:07<00:00,  1.05s/it]


In [93]:
await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Recognition: 100%|██████████| 226/226 [00:37<00:00,  6.01it/s]
Evaluating Recognition:   0%|          | 0/251 [00:00<?, ?it/s]

Failed QA recog call for model Qwen2.5-7B-Instruct-Turbo: Error communicating with Together


Evaluating Recognition: 100%|██████████| 251/251 [00:43<00:00,  5.73it/s]
Evaluating Recognition: 100%|██████████| 1/1 [00:01<00:00,  1.44s/it]
Evaluating Recognition: 100%|██████████| 406/406 [03:47<00:00,  1.78it/s]
Evaluating Recognition: 100%|██████████| 146/146 [00:25<00:00,  5.76it/s]


In [94]:
await evaluate_recog_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Recognition: 100%|██████████| 353/353 [03:45<00:00,  1.57it/s]  
Evaluating Recognition: 100%|██████████| 461/461 [04:39<00:00,  1.65it/s]  
Evaluating Recognition: 100%|██████████| 616/616 [06:26<00:00,  1.59it/s]  
Evaluating Recognition:   0%|          | 0/714 [00:00<?, ?it/s]

Failed QA recog call for model DeepSeek-V3: Error communicating with Together


Evaluating Recognition: 100%|██████████| 714/714 [09:29<00:00,  1.25it/s]  
Evaluating Recognition: 100%|██████████| 1/1 [00:07<00:00,  7.33s/it]


In [None]:
await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=False, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=False, use_synonym=True, repeat_failures=True)
    failed_comparisons = []

In [96]:
with open('.\quality\self_recog_quality_synonym_auto_beneficial.json', 'w') as f:
    json.dump(recog_results, f, indent=4)  # indent=4 makes it more readable

## Paraphrase (source answer with external)

### Harmful

In [139]:
recog_results = []
failed_comparisons = []

In [None]:
await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []

In [None]:
await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
    

In [None]:
await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []

In [None]:
await evaluate_recog_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []

In [None]:
await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []

In [147]:
with open('.\quality\self_recog_quality_paraphrase_source_external_harmful.json', 'w') as f:
    json.dump(recog_results, f, indent=4)  # indent=4 makes it more readable

## Paraphrasing (Competitor using Source Model)

### Harmful

In [10]:
recog_results = []
failed_comparisons = []

In [11]:
await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Recognition: 100%|██████████| 226/226 [00:20<00:00, 11.23it/s]
Evaluating Recognition: 100%|██████████| 200/200 [00:15<00:00, 12.64it/s]
Evaluating Recognition: 100%|██████████| 353/353 [00:24<00:00, 14.45it/s]
Evaluating Recognition: 100%|██████████| 179/179 [00:12<00:00, 14.82it/s]


In [12]:
await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
    

Evaluating Recognition:  54%|█████▍    | 135/251 [00:20<00:08, 14.01it/s]

Failed QA recog call for model Llama-4-Scout-17B-16E-Instruct: Error code: 503 - The server is overloaded or not ready yet.


Evaluating Recognition: 100%|██████████| 251/251 [00:26<00:00,  9.51it/s]
Evaluating Recognition: 100%|██████████| 1/1 [00:00<00:00,  2.10it/s]
Evaluating Recognition:  67%|██████▋   | 161/240 [01:58<37:58, 28.84s/it]

Failed QA recog call for model Llama-4-Scout-17B-16E-Instruct: Error code: 500 - {"message": "Internal Server Error"}


Evaluating Recognition: 100%|██████████| 240/240 [03:11<00:00,  1.25it/s]
Evaluating Recognition: 100%|██████████| 1/1 [00:01<00:00,  1.10s/it]
Evaluating Recognition:   0%|          | 0/461 [00:00<?, ?it/s]

Failed to process record 51320_4G14XR5O_3_0: can only concatenate str (not "NoneType") to str
Failed to process record 51274_8Q2YNHG5_6_0: can only concatenate str (not "NoneType") to str


Evaluating Recognition:  99%|█████████▉| 457/461 [00:45<00:01,  2.94it/s]

Failed QA recog call for model Llama-4-Scout-17B-16E-Instruct: Error code: 503 - The server is overloaded or not ready yet.


Evaluating Recognition: 100%|██████████| 461/461 [03:04<00:00,  2.50it/s]
Evaluating Recognition:   0%|          | 0/3 [00:00<?, ?it/s]

Failed to process record 51274_8Q2YNHG5_6_0: can only concatenate str (not "NoneType") to str
Failed to process record 51320_4G14XR5O_3_0: can only concatenate str (not "NoneType") to str


Evaluating Recognition: 100%|██████████| 3/3 [00:00<00:00,  4.57it/s]
Evaluating Recognition: 100%|██████████| 314/314 [00:22<00:00, 13.71it/s]


In [13]:
await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Recognition:   0%|          | 0/509 [00:00<?, ?it/s]

Failed to process record 31736_TV0CUXDH_2_0: can only concatenate str (not "NoneType") to str
Failed to process record 30035_C0HFCNPI_1_0: can only concatenate str (not "NoneType") to str


Evaluating Recognition:  13%|█▎        | 67/509 [00:47<00:31, 14.10it/s] 

Failed QA recog call for model Qwen2.5-7B-Instruct-Turbo: Error communicating with Together


Evaluating Recognition:  56%|█████▌    | 283/509 [01:10<00:18, 12.18it/s]

Failed QA recog call for model Qwen2.5-7B-Instruct-Turbo: Error communicating with Together


Evaluating Recognition: 100%|██████████| 509/509 [11:08<00:00,  1.31s/it] 


Failed QA recog call for model Qwen2.5-7B-Instruct-Turbo: Request timed out


Evaluating Recognition:   0%|          | 0/5 [00:00<?, ?it/s]

Failed to process record 31736_TV0CUXDH_2_0: can only concatenate str (not "NoneType") to str
Failed to process record 30035_C0HFCNPI_1_0: can only concatenate str (not "NoneType") to str


Evaluating Recognition: 100%|██████████| 5/5 [00:01<00:00,  3.96it/s]
Evaluating Recognition: 100%|██████████| 401/401 [10:33<00:00,  1.58s/it]


Failed QA recog call for model Qwen2.5-7B-Instruct-Turbo: Request timed out


Evaluating Recognition: 100%|██████████| 1/1 [00:01<00:00,  1.29s/it]
Evaluating Recognition: 100%|█████████▉| 317/318 [03:41<00:15, 15.36s/it]

Failed QA recog call for model Qwen2.5-7B-Instruct-Turbo: Error code: 503 - The server is overloaded or not ready yet.


Evaluating Recognition: 100%|██████████| 318/318 [04:48<00:00,  1.10it/s]
Evaluating Recognition: 100%|██████████| 1/1 [00:01<00:00,  1.11s/it]
Evaluating Recognition: 100%|██████████| 616/616 [04:03<00:00,  2.53it/s]


In [14]:
await evaluate_recog_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Recognition: 100%|██████████| 167/167 [01:36<00:00,  1.73it/s]
Evaluating Recognition: 100%|██████████| 140/140 [01:12<00:00,  1.92it/s]
Evaluating Recognition: 100%|██████████| 146/146 [01:15<00:00,  1.93it/s]
Evaluating Recognition: 100%|██████████| 153/153 [01:12<00:00,  2.11it/s]


In [15]:
await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Recognition:   0%|          | 0/575 [00:00<?, ?it/s]

Failed to process record 62085_C1SL2YBE_4_0: can only concatenate str (not "NoneType") to str


Evaluating Recognition:  99%|█████████▉| 572/575 [01:26<00:00, 17.91it/s]

Failed QA recog call for model Meta-Llama-3.1-8B-Instruct-Turbo: Request timed out


Evaluating Recognition: 100%|██████████| 575/575 [10:33<00:00,  1.10s/it]
Evaluating Recognition:   0%|          | 0/2 [00:00<?, ?it/s]

Failed to process record 62085_C1SL2YBE_4_0: can only concatenate str (not "NoneType") to str


Evaluating Recognition: 100%|██████████| 2/2 [00:01<00:00,  1.83it/s]
Evaluating Recognition:   0%|          | 0/479 [00:00<?, ?it/s]

Failed QA recog call for model Meta-Llama-3.1-8B-Instruct-Turbo: Error code: 503 - The server is overloaded or not ready yet.


Evaluating Recognition:  11%|█         | 51/479 [00:56<02:15,  3.16it/s] 

Failed QA recog call for model Meta-Llama-3.1-8B-Instruct-Turbo: Error code: 503 - The server is overloaded or not ready yet.


Evaluating Recognition:  63%|██████▎   | 303/479 [03:40<01:04,  2.71it/s]  

Failed QA recog call for model Meta-Llama-3.1-8B-Instruct-Turbo: Error code: 503 - The server is overloaded or not ready yet.


Evaluating Recognition: 100%|██████████| 479/479 [04:17<00:00,  1.86it/s]
Evaluating Recognition: 100%|██████████| 3/3 [00:04<00:00,  1.45s/it]
Evaluating Recognition:  99%|█████████▉| 401/406 [03:31<00:00, 11.92it/s]

Failed QA recog call for model Meta-Llama-3.1-8B-Instruct-Turbo: Error code: 503 - The server is overloaded or not ready yet.


Evaluating Recognition: 100%|█████████▉| 404/406 [03:44<00:07,  3.83s/it]

Failed QA recog call for model Meta-Llama-3.1-8B-Instruct-Turbo: Error code: 503 - The server is overloaded or not ready yet.


Evaluating Recognition: 100%|██████████| 406/406 [05:35<00:00,  1.21it/s]


Failed QA recog call for model Meta-Llama-3.1-8B-Instruct-Turbo: Error code: 503 - The server is overloaded or not ready yet.


Evaluating Recognition: 100%|██████████| 3/3 [00:00<00:00,  3.03it/s]
Evaluating Recognition: 100%|██████████| 714/714 [01:36<00:00,  7.38it/s]


In [16]:
with open('.\quality\self_recog_quality_paraphrase_harmful.json', 'w') as f:
    json.dump(recog_results, f, indent=4)  # indent=4 makes it more readable

### Beneficial

In [17]:
recog_results = []
failed_comparisons = []

In [18]:
await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Recognition: 100%|██████████| 509/509 [00:36<00:00, 14.07it/s]
Evaluating Recognition: 100%|██████████| 575/575 [00:37<00:00, 15.14it/s]
Evaluating Recognition: 100%|██████████| 167/167 [00:11<00:00, 13.97it/s]
Evaluating Recognition: 100%|██████████| 314/314 [00:22<00:00, 14.07it/s]


In [19]:
await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
    

Evaluating Recognition:   0%|          | 0/401 [00:00<?, ?it/s]

Failed to process record 24290_VOTN7PR9_7_0: can only concatenate str (not "NoneType") to str
Failed to process record 51436_MT3ROY6U_2_0: can only concatenate str (not "NoneType") to str


Evaluating Recognition: 100%|██████████| 401/401 [03:19<00:00,  2.01it/s]
Evaluating Recognition: 100%|██████████| 2/2 [00:00<?, ?it/s]


Failed to process record 24290_VOTN7PR9_7_0: can only concatenate str (not "NoneType") to str
Failed to process record 51436_MT3ROY6U_2_0: can only concatenate str (not "NoneType") to str


Evaluating Recognition:   0%|          | 0/479 [00:00<?, ?it/s]

Failed to process record 23563_HRCOMZPJ_9_0: can only concatenate str (not "NoneType") to str


Evaluating Recognition: 100%|██████████| 479/479 [00:27<00:00, 17.62it/s]
Evaluating Recognition: 100%|██████████| 1/1 [00:00<?, ?it/s]


Failed to process record 23563_HRCOMZPJ_9_0: can only concatenate str (not "NoneType") to str


Evaluating Recognition: 100%|██████████| 140/140 [00:07<00:00, 19.11it/s]
Evaluating Recognition: 100%|██████████| 179/179 [00:10<00:00, 16.66it/s]


In [20]:
await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Recognition: 100%|██████████| 226/226 [10:34<00:00,  2.81s/it] 


Failed QA recog call for model Qwen2.5-7B-Instruct-Turbo: Request timed out


Evaluating Recognition: 100%|██████████| 1/1 [00:01<00:00,  1.44s/it]
Evaluating Recognition: 100%|██████████| 251/251 [00:39<00:00,  6.38it/s]
Evaluating Recognition: 100%|██████████| 406/406 [03:31<00:00,  1.92it/s]
Evaluating Recognition: 100%|██████████| 146/146 [00:24<00:00,  6.02it/s]


In [21]:
await evaluate_recog_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Recognition: 100%|██████████| 353/353 [02:18<00:00,  2.54it/s]
Evaluating Recognition: 100%|██████████| 461/461 [03:17<00:00,  2.34it/s] 
Evaluating Recognition: 100%|██████████| 616/616 [03:49<00:00,  2.68it/s]  
Evaluating Recognition:  88%|████████▊ | 625/714 [04:33<00:32,  2.73it/s]  

Failed QA recog call for model DeepSeek-V3: Error code: 502 - Error code: 502 -<html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>cloudflare</center>
</body>
</html>



Evaluating Recognition: 100%|██████████| 714/714 [04:52<00:00,  2.44it/s]
Evaluating Recognition: 100%|██████████| 1/1 [00:02<00:00,  2.29s/it]


In [22]:
await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=False, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_recog_quality_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=False, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Recognition:   0%|          | 0/200 [00:00<?, ?it/s]

Failed to process record 22876_2BBI3WOT_3_0: can only concatenate str (not "NoneType") to str
Failed to process record 60745_U9M4CL5M_3_0: can only concatenate str (not "NoneType") to str


Evaluating Recognition: 100%|██████████| 200/200 [00:14<00:00, 13.49it/s]
Evaluating Recognition: 100%|██████████| 2/2 [00:00<00:00, 1999.19it/s]


Failed to process record 22876_2BBI3WOT_3_0: can only concatenate str (not "NoneType") to str
Failed to process record 60745_U9M4CL5M_3_0: can only concatenate str (not "NoneType") to str


Evaluating Recognition: 100%|██████████| 240/240 [00:17<00:00, 13.97it/s]
Evaluating Recognition: 100%|██████████| 318/318 [03:20<00:00,  1.59it/s]
Evaluating Recognition: 100%|██████████| 153/153 [03:05<00:00,  1.21s/it]


In [23]:
with open('.\quality\self_recog_quality_paraphrase_beneficial.json', 'w') as f:
    json.dump(recog_results, f, indent=4)  # indent=4 makes it more readable

# Third Party Judge 

In [18]:
async def evaluate_pref_quality_third_party_async(judge_model, evaluator_model, evaluatee_model, records, use_synonym=False, 
                                                  use_synonym_other=False, use_paraphrase=False, 
                                                  paraphrase_source_external=False, sentence_error_source=False, repeat_failures=False):
    model1 = evaluator_model
    model2 = evaluatee_model

    tasks = []

    for record in records:
        gt_label = record['output_label']
        pid = record.get('pid')
        model1_label = record.get(model1 + '_output_label')
        model2_label = record.get(model2 + '_output_label')
        if repeat_failures:
            if pid not in failed_comparisons:
                continue  # Only retry known failed records
        # Model 2 is the right answer always 
        if model1_label and model2_label and model1_label != gt_label and model2_label == gt_label:
            tasks.append(process_pref_record_third_party_perturb_wrong(record, judge_model, model1, model2, use_synonym=use_synonym, 
                                                                       use_synonym_other=use_synonym_other, use_paraphrase=use_paraphrase, 
                                                                       paraphrase_source_external=paraphrase_source_external, sentence_error_source=sentence_error_source))
        if model1_label and model2_label and model1_label == gt_label and model2_label != gt_label:
            tasks.append(process_pref_record_third_party_perturb_wrong(record, judge_model, model2, model1, use_synonym=use_synonym, 
                                                                       use_synonym_other=use_synonym_other, use_paraphrase=use_paraphrase, 
                                                                       paraphrase_source_external=paraphrase_source_external, sentence_error_source=sentence_error_source))
    for future in tqdm_asyncio.as_completed(tasks, total=len(tasks), desc="Evaluating Preferences"):
        await future



async def process_pref_record_third_party_perturb_wrong(record, judge_model, model1, model2, use_synonym, use_synonym_other, use_paraphrase, paraphrase_source_external, sentence_error_source):
    try:
        result = {
            'judge_model': judge_model,
            'correct_answer_model': model2,
            'wrong_answer_model': model1,
            'pid': record['pid']
        }
        if use_synonym:
            answer1 = record[model1 + '_output_label'] + ". " + record[model1 + '_reason_perturb_llm_auto']
        elif sentence_error_source:
            answer1 = record[model1+'_output_label'] + ". " + introduce_spelling_errors(record[model1+'_reason'])
        else:
            answer1 = record[model1 + '_output_label'] + ". " + record[model1 + '_reason']
        
        if use_synonym_other:
            answer2 = record[model2+'_output_label'] + ". " + record[model2+'_reason_perturb_llm_auto']
        elif use_paraphrase:
            answer2 = record[model2 + '_output_label'] + ". " + record[model2+ '_reason_paraphrased_' + model1]
        else:
            answer2 = record[model2 + '_output_label'] + ". " + record[model2 + '_reason']

        if paraphrase_source_external:
            key1 = model1 + '_reason_paraphrased_external'
            key2 = model2 + '_reason_paraphrased_external'
            # Check and assign accordingly
            # if key1 in record:
            #     answer1 = record[model1 + '_output_label'] + ". " + record[model1+ '_reason_paraphrased_external']
            #     answer2 = record[model2 + '_output_label'] + ". " + record[model2 + '_reason']
            if key2 in record:
                answer2 = record[model2 + '_output_label'] + ". " + record[model2+ '_reason_paraphrased_external']
                answer1 = record[model1 + '_output_label'] + ". " + record[model1 + '_reason']
            else:
                return None  # Skip this record if neither paraphrase exists
            
            
        forward_result = await get_model_choice_qa_comparison_async(
            judge_model, answer1, answer2, record['questions'], record['text'], return_logprobs=1
        )
        backward_result = await get_model_choice_qa_comparison_async(
            judge_model, answer2, answer1, record['questions'], record['text'], return_logprobs=1
        )

        if not forward_result or not backward_result:
            failed_comparisons.append(record['pid'])
            return
        result["forward_comparison"] = forward_result.tokens[0]
        result["forward_probability"] = exp(forward_result.token_logprobs[0])
        result["backward_comparison"] = backward_result.tokens[0]
        result["backward_probability"] = exp(backward_result.token_logprobs[0])

        preference_results.append(result)

    except Exception as e:
        print(f"Failed to process record {record['pid']}: {e}")
        failed_comparisons.append(record['pid'])


## Original

### Llama4.1 judges

In [49]:
with open(".\quality\pref_third_party_llamajudge_quality.json", "r") as f:
    preference_results = json.load(f)

In [13]:
await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, repeat_failures=True)
    failed_comparisons = []



Evaluating Preferences:  48%|████▊     | 345/724 [00:34<00:13, 28.74it/s]

Failed QA comparison call for model Llama-4-Scout-17B-16E-Instruct: Error code: 500 - {"message": "Internal Server Error"}


Evaluating Preferences: 100%|██████████| 724/724 [00:48<00:00, 14.78it/s]
Evaluating Preferences: 100%|██████████| 1/1 [00:00<00:00,  1.36it/s]
Evaluating Preferences: 100%|██████████| 867/867 [03:52<00:00,  3.73it/s]


Failed QA comparison call for model Llama-4-Scout-17B-16E-Instruct: Error code: 503 - The server is overloaded or not ready yet.


Evaluating Preferences: 100%|██████████| 1/1 [00:00<00:00,  1.86it/s]
Evaluating Preferences:   0%|          | 0/762 [00:00<?, ?it/s]

Failed QA comparison call for model Llama-4-Scout-17B-16E-Instruct: Error code: 503 - The server is overloaded or not ready yet.


Evaluating Preferences: 100%|██████████| 762/762 [05:35<00:00,  2.27it/s]
Evaluating Preferences: 100%|██████████| 1/1 [00:03<00:00,  3.69s/it]


In [51]:
await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, repeat_failures=True)
    failed_comparisons = []



Evaluating Preferences: 100%|██████████| 735/735 [01:07<00:00, 10.85it/s] 
Evaluating Preferences: 100%|██████████| 775/775 [00:41<00:00, 18.85it/s]
Evaluating Preferences:   0%|          | 0/520 [00:00<?, ?it/s]

Failed QA comparison call for model Llama-4-Scout-17B-16E-Instruct: Error communicating with Together


Evaluating Preferences: 100%|██████████| 520/520 [00:28<00:00, 18.54it/s]
Evaluating Preferences: 100%|██████████| 1/1 [00:00<00:00,  2.47it/s]


In [14]:
await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 762/762 [01:00<00:00, 12.67it/s]
Evaluating Preferences: 100%|██████████| 724/724 [00:50<00:00, 14.28it/s]
Evaluating Preferences: 100%|██████████| 867/867 [00:56<00:00, 15.48it/s]


In [52]:
await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 601/601 [00:37<00:00, 16.23it/s]
Evaluating Preferences: 100%|██████████| 652/652 [00:37<00:00, 17.38it/s]
Evaluating Preferences: 100%|██████████| 719/719 [00:41<00:00, 17.18it/s]


In [53]:
with open(".\quality\pref_third_party_llamajudge_quality.json", "w") as f:
    json.dump(preference_results, f, indent=4)  

### Continue Old Models

In [54]:
with open(".\quality\pref_results_third_party_eval_original.json", "r") as f:
    preference_results = json.load(f)

In [56]:
await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 735/735 [01:24<00:00,  8.72it/s] 
Evaluating Preferences: 100%|██████████| 652/652 [01:12<00:00,  9.01it/s]
Evaluating Preferences: 100%|██████████| 493/493 [01:17<00:00,  6.36it/s]
Evaluating Preferences: 100%|██████████| 601/601 [02:16<00:00,  4.40it/s] 
Evaluating Preferences: 100%|██████████| 520/520 [02:10<00:00,  4.00it/s]


In [57]:
await evaluate_pref_quality_third_party_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences:  37%|███▋      | 272/735 [02:46<01:39,  4.65it/s]  

Failed QA comparison call for model DeepSeek-V3: Error code: 500 - {"message": "Internal Server Error"}


Evaluating Preferences: 100%|██████████| 735/735 [04:43<00:00,  2.59it/s]
Evaluating Preferences: 100%|██████████| 1/1 [00:02<00:00,  2.44s/it]
Evaluating Preferences: 100%|██████████| 652/652 [04:13<00:00,  2.57it/s]  
Evaluating Preferences: 100%|██████████| 493/493 [04:22<00:00,  1.88it/s] 
Evaluating Preferences: 100%|██████████| 719/719 [03:44<00:00,  3.20it/s]  
Evaluating Preferences: 100%|██████████| 775/775 [05:02<00:00,  2.57it/s]  


In [58]:
await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 520/520 [01:20<00:00,  6.48it/s]
Evaluating Preferences: 100%|██████████| 601/601 [01:32<00:00,  6.49it/s]
Evaluating Preferences: 100%|██████████| 493/493 [01:15<00:00,  6.52it/s]
Evaluating Preferences: 100%|██████████| 719/719 [01:48<00:00,  6.60it/s] 
Evaluating Preferences: 100%|██████████| 775/775 [02:00<00:00,  6.44it/s] 


In [60]:
with open(".\quality\pref_third_party_llamajudge_quality.json", "r") as f:
    other_preference_results = json.load(f)

In [62]:
preference_results = preference_results + other_preference_results

In [64]:
with open(".\quality\pref_results_third_party_eval_original.json", "w") as f:
    json.dump(preference_results, f, indent=4)  

## Perturb (Wrong Answer)

### All Models

In [20]:
preference_results = []
failed_comparisons = []

In [21]:
with open(".\quality\pref_third_party_llamajudge_synonym_auto_wrong_quality.json", "r") as f:
    preference_results = json.load(f)

In [22]:
len(preference_results)

4706

In [None]:
await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3",  responses, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo",  responses, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########


In [23]:
await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
    

Evaluating Preferences: 100%|██████████| 735/735 [00:55<00:00, 13.23it/s]
Evaluating Preferences:   9%|▊         | 67/775 [00:25<00:23, 30.30it/s] 

Failed QA comparison call for model Llama-4-Scout-17B-16E-Instruct: Error communicating with Together


Evaluating Preferences: 100%|██████████| 775/775 [00:46<00:00, 16.81it/s]
Evaluating Preferences: 100%|██████████| 1/1 [00:00<00:00,  1.88it/s]
Evaluating Preferences: 100%|██████████| 520/520 [00:31<00:00, 16.46it/s]


In [None]:
await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3",  responses, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo",  responses, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########


Evaluating Preferences: 100%|██████████| 724/724 [01:01<00:00, 11.81it/s]
Evaluating Preferences: 100%|██████████| 867/867 [01:19<00:00, 10.95it/s]
Evaluating Preferences: 100%|██████████| 762/762 [00:54<00:00, 14.08it/s]


In [24]:
await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8",  "Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8",  "Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, use_synonym=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 652/652 [00:52<00:00, 12.38it/s]
Evaluating Preferences: 100%|██████████| 719/719 [00:47<00:00, 15.26it/s]
Evaluating Preferences: 100%|██████████| 601/601 [00:37<00:00, 15.89it/s]


In [25]:
await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","Meta-Llama-3.1-8B-Instruct-Turbo","DeepSeek-V3", responses, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","Meta-Llama-3.1-8B-Instruct-Turbo","DeepSeek-V3", responses, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","DeepSeek-V3","Llama-4-Scout-17B-16E-Instruct", responses, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","DeepSeek-V3","Llama-4-Scout-17B-16E-Instruct", responses, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8","Llama-4-Scout-17B-16E-Instruct", responses, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8","Llama-4-Scout-17B-16E-Instruct", responses, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8","Meta-Llama-3.1-8B-Instruct-Turbo", responses, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8","Meta-Llama-3.1-8B-Instruct-Turbo", responses, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","DeepSeek-V3","Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","DeepSeek-V3","Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","Llama-4-Scout-17B-16E-Instruct","Meta-Llama-3.1-8B-Instruct-Turbo", responses, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","Llama-4-Scout-17B-16E-Instruct","Meta-Llama-3.1-8B-Instruct-Turbo", responses, use_synonym=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 867/867 [03:11<00:00,  4.53it/s] 
Evaluating Preferences: 100%|██████████| 601/601 [02:37<00:00,  3.82it/s] 
Evaluating Preferences: 100%|██████████| 493/493 [02:01<00:00,  4.05it/s]
Evaluating Preferences: 100%|██████████| 775/775 [02:40<00:00,  4.81it/s] 
Evaluating Preferences: 100%|██████████| 520/520 [01:45<00:00,  4.91it/s]
Evaluating Preferences:   0%|          | 0/719 [00:00<?, ?it/s]

Failed QA comparison call for model Qwen2.5-7B-Instruct-Turbo: Error code: 500 - {"message": "Internal Server Error"}


Evaluating Preferences: 100%|██████████| 719/719 [01:46<00:00,  6.78it/s] 
Evaluating Preferences: 100%|██████████| 1/1 [00:01<00:00,  1.23s/it]


In [26]:
await evaluate_pref_quality_third_party_async("DeepSeek-V3","Meta-Llama-3.1-8B-Instruct-Turbo","Qwen2.5-7B-Instruct-Turbo", responses, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("DeepSeek-V3","Meta-Llama-3.1-8B-Instruct-Turbo","Qwen2.5-7B-Instruct-Turbo", responses, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("DeepSeek-V3","Meta-Llama-3.1-8B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("DeepSeek-V3","Meta-Llama-3.1-8B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("DeepSeek-V3","Meta-Llama-3.1-8B-Instruct-Turbo","Llama-4-Scout-17B-16E-Instruct", responses, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("DeepSeek-V3","Meta-Llama-3.1-8B-Instruct-Turbo","Llama-4-Scout-17B-16E-Instruct", responses, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("DeepSeek-V3","Qwen2.5-7B-Instruct-Turbo","Llama-4-Scout-17B-16E-Instruct", responses, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("DeepSeek-V3","Qwen2.5-7B-Instruct-Turbo","Llama-4-Scout-17B-16E-Instruct", responses, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("DeepSeek-V3","Qwen2.5-7B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("DeepSeek-V3","Qwen2.5-7B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("DeepSeek-V3","Llama-4-Maverick-17B-128E-Instruct-FP8","Llama-4-Scout-17B-16E-Instruct", responses, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("DeepSeek-V3","Llama-4-Maverick-17B-128E-Instruct-FP8","Llama-4-Scout-17B-16E-Instruct", responses, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########    

Evaluating Preferences: 100%|██████████| 724/724 [08:21<00:00,  1.45it/s]  
Evaluating Preferences: 100%|██████████| 775/775 [09:51<00:00,  1.31it/s]  
Evaluating Preferences:   0%|          | 0/719 [00:00<?, ?it/s]

Failed QA comparison call for model DeepSeek-V3: Error communicating with Together
Failed QA comparison call for model DeepSeek-V3: Error code: 502 - Error code: 502 -<html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>cloudflare</center>
</body>
</html>



Evaluating Preferences:  66%|██████▌   | 471/719 [04:54<00:50,  4.92it/s]  

Failed QA comparison call for model DeepSeek-V3: Error communicating with Together


Evaluating Preferences: 100%|██████████| 719/719 [05:51<00:00,  2.04it/s]
Evaluating Preferences: 100%|██████████| 3/3 [00:03<00:00,  1.05s/it]
Evaluating Preferences: 100%|██████████| 652/652 [04:19<00:00,  2.51it/s]  
Evaluating Preferences:  72%|███████▏  | 526/735 [04:58<00:32,  6.44it/s]  

Failed QA comparison call for model DeepSeek-V3: Error code: 502 - Error code: 502 -<html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>cloudflare</center>
</body>
</html>



Evaluating Preferences: 100%|██████████| 735/735 [06:06<00:00,  2.01it/s]
Evaluating Preferences: 100%|██████████| 1/1 [00:04<00:00,  4.57s/it]
Evaluating Preferences:   0%|          | 0/493 [00:00<?, ?it/s]

Failed QA comparison call for model DeepSeek-V3: Error code: 502 - Error code: 502 -<html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>cloudflare</center>
</body>
</html>



Evaluating Preferences: 100%|██████████| 493/493 [05:48<00:00,  1.41it/s]  
Evaluating Preferences: 100%|██████████| 1/1 [00:04<00:00,  4.26s/it]


In [27]:
await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","DeepSeek-V3","Qwen2.5-7B-Instruct-Turbo", responses, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","DeepSeek-V3","Qwen2.5-7B-Instruct-Turbo", responses, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","DeepSeek-V3","Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","DeepSeek-V3","Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","DeepSeek-V3","Llama-4-Scout-17B-16E-Instruct", responses, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","DeepSeek-V3","Llama-4-Scout-17B-16E-Instruct", responses, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","Qwen2.5-7B-Instruct-Turbo","Llama-4-Scout-17B-16E-Instruct", responses, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","Qwen2.5-7B-Instruct-Turbo","Llama-4-Scout-17B-16E-Instruct", responses, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","Qwen2.5-7B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","Qwen2.5-7B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_synonym=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8","Llama-4-Scout-17B-16E-Instruct", responses, use_synonym=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8","Llama-4-Scout-17B-16E-Instruct", responses, use_synonym=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences:   0%|          | 0/762 [00:00<?, ?it/s]

Failed QA comparison call for model Meta-Llama-3.1-8B-Instruct-Turbo: Error code: 503 - The server is overloaded or not ready yet.
Failed QA comparison call for model Meta-Llama-3.1-8B-Instruct-Turbo: Error code: 503 - The server is overloaded or not ready yet.
Failed QA comparison call for model Meta-Llama-3.1-8B-Instruct-Turbo: Error code: 503 - The server is overloaded or not ready yet.
Failed QA comparison call for model Meta-Llama-3.1-8B-Instruct-Turbo: Error code: 503 - The server is overloaded or not ready yet.
Failed QA comparison call for model Meta-Llama-3.1-8B-Instruct-Turbo: Error code: 503 - The server is overloaded or not ready yet.
Failed QA comparison call for model Meta-Llama-3.1-8B-Instruct-Turbo: Error code: 503 - The server is overloaded or not ready yet.
Failed QA comparison call for model Meta-Llama-3.1-8B-Instruct-Turbo: Error code: 503 - The server is overloaded or not ready yet.
Failed QA comparison call for model Meta-Llama-3.1-8B-Instruct-Turbo: Error code: 5

Evaluating Preferences: 100%|██████████| 762/762 [02:41<00:00,  4.73it/s]  
Evaluating Preferences: 100%|██████████| 10/10 [00:01<00:00,  7.12it/s]
Evaluating Preferences: 100%|██████████| 520/520 [00:48<00:00, 10.64it/s]
Evaluating Preferences: 100%|██████████| 601/601 [00:57<00:00, 10.48it/s]
Evaluating Preferences: 100%|██████████| 652/652 [00:58<00:00, 11.20it/s]
Evaluating Preferences: 100%|██████████| 735/735 [01:20<00:00,  9.18it/s]
Evaluating Preferences: 100%|██████████| 493/493 [01:01<00:00,  8.07it/s]


In [28]:
with open(".\quality\pref_third_party_all_models_synonym_auto_wrong_quality.json", "w") as f:
    json.dump(preference_results, f, indent=4)  

### Smaller Subset (old models)

In [128]:
preference_results = []
failed_comparisons = []

In [None]:
await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","Meta-Llama-3.1-8B-Instruct-Turbo","DeepSeek-V3", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","Meta-Llama-3.1-8B-Instruct-Turbo","DeepSeek-V3", responses, repeat_failures=True)
    failed_comparisons = []
#####
await evaluate_pref_quality_third_party_async("DeepSeek-V3","Meta-Llama-3.1-8B-Instruct-Turbo","Qwen2.5-7B-Instruct-Turbo", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("DeepSeek-V3","Meta-Llama-3.1-8B-Instruct-Turbo","Qwen2.5-7B-Instruct-Turbo", responses, repeat_failures=True)
    failed_comparisons = []
#####
await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","DeepSeek-V3","Qwen2.5-7B-Instruct-Turbo", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","DeepSeek-V3","Qwen2.5-7B-Instruct-Turbo", responses, repeat_failures=True)
    failed_comparisons = []

######


In [92]:
with open(".\quality\pref_third_party_synonym_auto_wrong_only_quality.json", "w") as f:
    json.dump(preference_results, f, indent=4)  

## Perturb (Right Answer)

In [17]:
preference_results = []
failed_comparisons = []

In [None]:
await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3",  responses, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo",  responses, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
    

In [20]:
await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3",  responses, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo",  responses, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []

await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8",  "Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8",  "Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 724/724 [00:51<00:00, 14.08it/s]
Evaluating Preferences: 100%|██████████| 867/867 [00:49<00:00, 17.46it/s]
Evaluating Preferences: 100%|██████████| 762/762 [00:43<00:00, 17.58it/s]
Evaluating Preferences: 100%|██████████| 652/652 [00:35<00:00, 18.24it/s]
Evaluating Preferences: 100%|██████████| 719/719 [00:40<00:00, 17.76it/s]
Evaluating Preferences: 100%|██████████| 601/601 [00:32<00:00, 18.31it/s]


In [21]:
await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","Meta-Llama-3.1-8B-Instruct-Turbo","DeepSeek-V3", responses, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","Meta-Llama-3.1-8B-Instruct-Turbo","DeepSeek-V3", responses, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","DeepSeek-V3","Llama-4-Scout-17B-16E-Instruct", responses, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","DeepSeek-V3","Llama-4-Scout-17B-16E-Instruct", responses, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8","Llama-4-Scout-17B-16E-Instruct", responses, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8","Llama-4-Scout-17B-16E-Instruct", responses, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8","Meta-Llama-3.1-8B-Instruct-Turbo", responses, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8","Meta-Llama-3.1-8B-Instruct-Turbo", responses, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","DeepSeek-V3","Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","DeepSeek-V3","Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","Llama-4-Scout-17B-16E-Instruct","Meta-Llama-3.1-8B-Instruct-Turbo", responses, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","Llama-4-Scout-17B-16E-Instruct","Meta-Llama-3.1-8B-Instruct-Turbo", responses, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 867/867 [02:39<00:00,  5.45it/s] 
Evaluating Preferences: 100%|██████████| 601/601 [01:54<00:00,  5.25it/s]
Evaluating Preferences: 100%|██████████| 493/493 [01:35<00:00,  5.14it/s]
Evaluating Preferences: 100%|██████████| 775/775 [02:32<00:00,  5.10it/s] 
Evaluating Preferences: 100%|██████████| 520/520 [01:44<00:00,  4.98it/s]
Evaluating Preferences: 100%|██████████| 719/719 [02:18<00:00,  5.19it/s] 


In [22]:
await evaluate_pref_quality_third_party_async("DeepSeek-V3","Meta-Llama-3.1-8B-Instruct-Turbo","Qwen2.5-7B-Instruct-Turbo", responses, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("DeepSeek-V3","Meta-Llama-3.1-8B-Instruct-Turbo","Qwen2.5-7B-Instruct-Turbo", responses, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("DeepSeek-V3","Meta-Llama-3.1-8B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("DeepSeek-V3","Meta-Llama-3.1-8B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("DeepSeek-V3","Meta-Llama-3.1-8B-Instruct-Turbo","Llama-4-Scout-17B-16E-Instruct", responses, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("DeepSeek-V3","Meta-Llama-3.1-8B-Instruct-Turbo","Llama-4-Scout-17B-16E-Instruct", responses, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("DeepSeek-V3","Qwen2.5-7B-Instruct-Turbo","Llama-4-Scout-17B-16E-Instruct", responses, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("DeepSeek-V3","Qwen2.5-7B-Instruct-Turbo","Llama-4-Scout-17B-16E-Instruct", responses, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("DeepSeek-V3","Qwen2.5-7B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("DeepSeek-V3","Qwen2.5-7B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("DeepSeek-V3","Llama-4-Maverick-17B-128E-Instruct-FP8","Llama-4-Scout-17B-16E-Instruct", responses, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("DeepSeek-V3","Llama-4-Maverick-17B-128E-Instruct-FP8","Llama-4-Scout-17B-16E-Instruct", responses, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########    

Evaluating Preferences: 100%|██████████| 724/724 [03:53<00:00,  3.10it/s]  
Evaluating Preferences: 100%|██████████| 775/775 [04:22<00:00,  2.95it/s]  
Evaluating Preferences: 100%|██████████| 719/719 [04:54<00:00,  2.44it/s]  
Evaluating Preferences: 100%|██████████| 652/652 [05:08<00:00,  2.11it/s]  
Evaluating Preferences: 100%|██████████| 735/735 [05:41<00:00,  2.15it/s]  
Evaluating Preferences: 100%|██████████| 493/493 [04:08<00:00,  1.98it/s]  


In [23]:
await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","DeepSeek-V3","Qwen2.5-7B-Instruct-Turbo", responses, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","DeepSeek-V3","Qwen2.5-7B-Instruct-Turbo", responses, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","DeepSeek-V3","Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","DeepSeek-V3","Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","DeepSeek-V3","Llama-4-Scout-17B-16E-Instruct", responses, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","DeepSeek-V3","Llama-4-Scout-17B-16E-Instruct", responses, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","Qwen2.5-7B-Instruct-Turbo","Llama-4-Scout-17B-16E-Instruct", responses, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","Qwen2.5-7B-Instruct-Turbo","Llama-4-Scout-17B-16E-Instruct", responses, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","Qwen2.5-7B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","Qwen2.5-7B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8","Llama-4-Scout-17B-16E-Instruct", responses, use_synonym_other=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8","Llama-4-Scout-17B-16E-Instruct", responses, use_synonym_other=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 762/762 [03:00<00:00,  4.21it/s] 
Evaluating Preferences: 100%|██████████| 520/520 [02:32<00:00,  3.42it/s]
Evaluating Preferences: 100%|██████████| 601/601 [03:29<00:00,  2.87it/s]  
Evaluating Preferences:  47%|████▋     | 309/652 [01:44<00:39,  8.77it/s] 

Failed QA comparison call for model Meta-Llama-3.1-8B-Instruct-Turbo: Error communicating with Together


Evaluating Preferences: 100%|██████████| 652/652 [02:19<00:00,  4.66it/s]
Evaluating Preferences: 100%|██████████| 1/1 [00:00<00:00,  1.04it/s]
Evaluating Preferences:  61%|██████    | 448/735 [02:09<00:53,  5.36it/s] 

Failed QA comparison call for model Meta-Llama-3.1-8B-Instruct-Turbo: Error code: 500 - {"message": "Internal Server Error"}


Evaluating Preferences: 100%|██████████| 735/735 [02:57<00:00,  4.15it/s]
Evaluating Preferences: 100%|██████████| 1/1 [00:01<00:00,  1.68s/it]
Evaluating Preferences: 100%|██████████| 493/493 [01:58<00:00,  4.15it/s]


In [24]:
with open(".\quality\pref_third_party_all_models_synonym_auto_right_quality.json", "w") as f:
    json.dump(preference_results, f, indent=4)  

## Paraphrase (wrong using external model)

### Llama4.1

In [161]:
preference_results = []
failed_comparisons = []

In [162]:
await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3",  responses, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo",  responses, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []



Evaluating Preferences: 100%|██████████| 724/724 [00:00<00:00, 80057.90it/s]
Evaluating Preferences: 100%|██████████| 867/867 [00:00<00:00, 123857.68it/s]
Evaluating Preferences: 100%|██████████| 762/762 [00:00<00:00, 109021.00it/s]


In [163]:
await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3",  responses, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo",  responses, paraphrase_source_external=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, paraphrase_source_external=True, repeat_failures=True)
    failed_comparisons = []


Evaluating Preferences: 100%|██████████| 724/724 [00:00<00:00, 144569.20it/s]
Evaluating Preferences: 100%|██████████| 867/867 [00:00<00:00, 145394.49it/s]
Evaluating Preferences: 100%|██████████| 762/762 [00:00<00:00, 109341.76it/s]


In [159]:
with open(".\quality\pref_third_party_llamajudge_paraphrase_external_wrong_quality.json", "w") as f:
    json.dump(preference_results, f, indent=4)  

## Spelling Error Source

### Llama 4.1

In [None]:
preference_results = []
failed_comparisons = []

In [214]:
await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, sentence_error_source=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, sentence_error_source=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3",  responses, sentence_error_source=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, sentence_error_source=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo",  responses, sentence_error_source=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, sentence_error_source=True, repeat_failures=True)
    failed_comparisons = []



Evaluating Preferences: 100%|██████████| 724/724 [00:52<00:00, 13.81it/s]
Evaluating Preferences: 100%|██████████| 867/867 [00:46<00:00, 18.62it/s]
Evaluating Preferences: 100%|██████████| 762/762 [00:38<00:00, 19.56it/s]


In [215]:
await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, sentence_error_source=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, sentence_error_source=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3",  responses, sentence_error_source=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, sentence_error_source=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo",  responses, sentence_error_source=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, sentence_error_source=True, repeat_failures=True)
    failed_comparisons = []


Evaluating Preferences: 100%|██████████| 724/724 [00:51<00:00, 14.17it/s]
Evaluating Preferences: 100%|██████████| 867/867 [00:53<00:00, 16.21it/s]
Evaluating Preferences: 100%|██████████| 762/762 [00:43<00:00, 17.64it/s]


In [216]:
with open(".\quality\pref_third_party_llamajudge_spelling_error_wrong_quality.json", "w") as f:
    json.dump(preference_results, f, indent=4)  

## Paraphrase

### Llama4.1

In [29]:
preference_results = []
failed_comparisons = []

In [30]:
failed_comparisons = []
preference_results = json.load(open(".\quality\pref_third_party_llamajudge_paraphrase_quality.json", "r"))

In [31]:
len(preference_results)

4706

In [None]:
await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3",  responses, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo",  responses, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []



In [None]:
await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
    

In [35]:
await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3",  responses, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo",  responses, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []


Evaluating Preferences: 100%|██████████| 724/724 [01:19<00:00,  9.10it/s]
Evaluating Preferences: 100%|██████████| 867/867 [01:00<00:00, 14.36it/s]
Evaluating Preferences: 100%|██████████| 762/762 [00:50<00:00, 15.06it/s]


In [33]:
await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8",  "Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8",  "Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 652/652 [01:10<00:00,  9.30it/s]
Evaluating Preferences: 100%|██████████| 719/719 [00:47<00:00, 15.12it/s]
Evaluating Preferences:   0%|          | 0/601 [00:00<?, ?it/s]

Failed to process record 51320_4G14XR5O_3_0: can only concatenate str (not "NoneType") to str
Failed to process record 51274_8Q2YNHG5_6_0: can only concatenate str (not "NoneType") to str


Evaluating Preferences: 100%|██████████| 601/601 [00:38<00:00, 15.50it/s]
Evaluating Preferences: 100%|██████████| 2/2 [00:00<00:00, 2000.62it/s]

Failed to process record 51320_4G14XR5O_3_0: can only concatenate str (not "NoneType") to str
Failed to process record 51274_8Q2YNHG5_6_0: can only concatenate str (not "NoneType") to str





In [None]:
await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","Meta-Llama-3.1-8B-Instruct-Turbo","DeepSeek-V3", responses, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","Meta-Llama-3.1-8B-Instruct-Turbo","DeepSeek-V3", responses, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","DeepSeek-V3","Llama-4-Scout-17B-16E-Instruct", responses, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","DeepSeek-V3","Llama-4-Scout-17B-16E-Instruct", responses, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8","Llama-4-Scout-17B-16E-Instruct", responses, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8","Llama-4-Scout-17B-16E-Instruct", responses, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8","Meta-Llama-3.1-8B-Instruct-Turbo", responses, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8","Meta-Llama-3.1-8B-Instruct-Turbo", responses, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","DeepSeek-V3","Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","DeepSeek-V3","Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","Llama-4-Scout-17B-16E-Instruct","Meta-Llama-3.1-8B-Instruct-Turbo", responses, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","Llama-4-Scout-17B-16E-Instruct","Meta-Llama-3.1-8B-Instruct-Turbo", responses, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []

In [None]:
await evaluate_pref_quality_third_party_async("DeepSeek-V3","Meta-Llama-3.1-8B-Instruct-Turbo","Qwen2.5-7B-Instruct-Turbo", responses, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("DeepSeek-V3","Meta-Llama-3.1-8B-Instruct-Turbo","Qwen2.5-7B-Instruct-Turbo", responses, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("DeepSeek-V3","Meta-Llama-3.1-8B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("DeepSeek-V3","Meta-Llama-3.1-8B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("DeepSeek-V3","Meta-Llama-3.1-8B-Instruct-Turbo","Llama-4-Scout-17B-16E-Instruct", responses, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("DeepSeek-V3","Meta-Llama-3.1-8B-Instruct-Turbo","Llama-4-Scout-17B-16E-Instruct", responses, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("DeepSeek-V3","Qwen2.5-7B-Instruct-Turbo","Llama-4-Scout-17B-16E-Instruct", responses, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("DeepSeek-V3","Qwen2.5-7B-Instruct-Turbo","Llama-4-Scout-17B-16E-Instruct", responses, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("DeepSeek-V3","Qwen2.5-7B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("DeepSeek-V3","Qwen2.5-7B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("DeepSeek-V3","Llama-4-Maverick-17B-128E-Instruct-FP8","Llama-4-Scout-17B-16E-Instruct", responses, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("DeepSeek-V3","Llama-4-Maverick-17B-128E-Instruct-FP8","Llama-4-Scout-17B-16E-Instruct", responses, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########    

In [36]:
await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","DeepSeek-V3","Qwen2.5-7B-Instruct-Turbo", responses, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","DeepSeek-V3","Qwen2.5-7B-Instruct-Turbo", responses, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","DeepSeek-V3","Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","DeepSeek-V3","Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","DeepSeek-V3","Llama-4-Scout-17B-16E-Instruct", responses, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","DeepSeek-V3","Llama-4-Scout-17B-16E-Instruct", responses, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","Qwen2.5-7B-Instruct-Turbo","Llama-4-Scout-17B-16E-Instruct", responses, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","Qwen2.5-7B-Instruct-Turbo","Llama-4-Scout-17B-16E-Instruct", responses, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","Qwen2.5-7B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","Qwen2.5-7B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8", responses, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8","Llama-4-Scout-17B-16E-Instruct", responses, use_paraphrase=True)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo","Llama-4-Maverick-17B-128E-Instruct-FP8","Llama-4-Scout-17B-16E-Instruct", responses, use_paraphrase=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 762/762 [01:23<00:00,  9.12it/s]
Evaluating Preferences:   0%|          | 0/520 [00:00<?, ?it/s]

Failed QA comparison call for model Meta-Llama-3.1-8B-Instruct-Turbo: Error communicating with Together
Failed QA comparison call for model Meta-Llama-3.1-8B-Instruct-Turbo: Error communicating with Together


Evaluating Preferences: 100%|██████████| 520/520 [00:50<00:00, 10.31it/s]
Evaluating Preferences: 100%|██████████| 2/2 [00:01<00:00,  1.45it/s]
Evaluating Preferences:   0%|          | 0/601 [00:00<?, ?it/s]

Failed to process record 51320_4G14XR5O_3_0: can only concatenate str (not "NoneType") to str
Failed to process record 51274_8Q2YNHG5_6_0: can only concatenate str (not "NoneType") to str


Evaluating Preferences: 100%|██████████| 601/601 [00:56<00:00, 10.55it/s]
Evaluating Preferences: 100%|██████████| 2/2 [00:00<00:00, 1998.24it/s]


Failed to process record 51320_4G14XR5O_3_0: can only concatenate str (not "NoneType") to str
Failed to process record 51274_8Q2YNHG5_6_0: can only concatenate str (not "NoneType") to str


Evaluating Preferences: 100%|██████████| 652/652 [01:03<00:00, 10.20it/s]
Evaluating Preferences:   0%|          | 0/735 [00:00<?, ?it/s]

Failed to process record 30035_C0HFCNPI_1_0: can only concatenate str (not "NoneType") to str
Failed to process record 31736_TV0CUXDH_2_0: can only concatenate str (not "NoneType") to str


Evaluating Preferences: 100%|██████████| 735/735 [01:23<00:00,  8.85it/s]
Evaluating Preferences: 100%|██████████| 2/2 [00:00<00:00, 2004.93it/s]


Failed to process record 30035_C0HFCNPI_1_0: can only concatenate str (not "NoneType") to str
Failed to process record 31736_TV0CUXDH_2_0: can only concatenate str (not "NoneType") to str


Evaluating Preferences: 100%|██████████| 493/493 [01:10<00:00,  6.99it/s]


In [38]:
with open(".\quality\pref_third_party_all_models_paraphrase_quality.json", "w") as f:
    json.dump(preference_results, f, indent=4)  

## Both Reasons Are Modified (old corrupt syn)

In [40]:
async def evaluate_pref_quality_third_party_async_both_perturb(judge_model, evaluator_model, evaluatee_model, records, repeat_failures=False):
    model1 = evaluator_model
    model2 = evaluatee_model

    tasks = []

    for record in records:
        gt_label = record['output_label']
        pid = record.get('pid')
        model1_label = record.get(model1 + '_output_label')
        model2_label = record.get(model2 + '_output_label')
        if repeat_failures:
            if pid not in failed_comparisons:
                continue  # Only retry known failed records
        # Model 2 is the right answer always 
        if model1_label and model2_label and model1_label != gt_label and model2_label == gt_label:
            tasks.append(process_pref_record_third_party(record, judge_model, model1, model2))
        if model1_label and model2_label and model1_label == gt_label and model2_label != gt_label:
            tasks.append(process_pref_record_third_party(record, judge_model, model2, model1))
    for future in tqdm_asyncio.as_completed(tasks, total=len(tasks), desc="Evaluating Preferences"):
        await future



async def process_pref_record_third_party(record, judge_model, model1, model2):
    try:
        result = {
            'judge_model': judge_model,
            'correct_answer_model': model2,
            'wrong_answer_model': model1,
            'pid': record['pid']
        }

        answer1 = record[model1 + '_output_label'] + ". " + record[model1 + '_reason_perturb2_meta']
        answer2 = record[model2 + '_output_label'] + ". " + record[model2 + '_reason_perturb2_meta']

        forward_result = await get_model_choice_qa_comparison_async(
            judge_model, answer1, answer2, record['questions'], record['text'], return_logprobs=1
        )
        backward_result = await get_model_choice_qa_comparison_async(
            judge_model, answer2, answer1, record['questions'], record['text'], return_logprobs=1
        )

        if not forward_result or not backward_result:
            failed_comparisons.append(record['pid'])
            return
        result["forward_comparison"] = forward_result.tokens[0]
        result["forward_probability"] = exp(forward_result.token_logprobs[0])
        result["backward_comparison"] = backward_result.tokens[0]
        result["backward_probability"] = exp(backward_result.token_logprobs[0])

        preference_results.append(result)

    except Exception as e:
        print(f"Failed to process record {record['pid']}: {e}")
        failed_comparisons.append(record['pid'])


In [97]:
preference_results = []

In [108]:
await evaluate_pref_quality_third_party_async("Qwen2.5-7B-Instruct-Turbo","Meta-Llama-3.1-8B-Instruct-Turbo","DeepSeek-V3", responses)

Evaluating Preferences: 100%|██████████| 153/153 [00:23<00:00,  6.50it/s]


In [112]:
await evaluate_pref_quality_third_party_async("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses)

Evaluating Preferences:   0%|          | 0/762 [00:00<?, ?it/s]

Failed QA comparison call for model Meta-Llama-3.1-8B-Instruct-Turbo: Error code: 500 - {"message": "Internal Server Error"}


Evaluating Preferences: 100%|██████████| 762/762 [02:45<00:00,  4.62it/s] 


In [116]:
await evaluate_pref_quality_third_party_async("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses)

Evaluating Preferences:  92%|█████████▏| 663/724 [12:47<00:21,  2.88it/s]  

Failed QA comparison call for model DeepSeek-V3: Error code: 500 - {"message": "Internal Server Error"}


Evaluating Preferences: 100%|██████████| 724/724 [13:09<00:00,  1.09s/it]


In [118]:
len(preference_results)

2351

In [119]:
with open(".\quality\pref_results_third_party_eval_both_perturb.json", "w") as f:
    json.dump(preference_results, f, indent=4)  # indent=4 makes it more readable

### Llama4.1

In [39]:
preference_results = []
failed_comparisons = []

In [41]:
await evaluate_pref_quality_third_party_async_both_perturb("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async_both_perturb("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async_both_perturb("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3",  responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async_both_perturb("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async_both_perturb("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo",  responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async_both_perturb("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, repeat_failures=True)
    failed_comparisons = []



Evaluating Preferences: 100%|██████████| 724/724 [03:49<00:00,  3.15it/s]
Evaluating Preferences: 100%|██████████| 867/867 [03:05<00:00,  4.68it/s]
Evaluating Preferences: 100%|██████████| 762/762 [03:05<00:00,  4.11it/s]


In [44]:
await evaluate_pref_quality_third_party_async_both_perturb("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async_both_perturb("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async_both_perturb("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3",  responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async_both_perturb("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async_both_perturb("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo",  responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async_both_perturb("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, repeat_failures=True)
    failed_comparisons = []



Evaluating Preferences: 100%|██████████| 724/724 [00:54<00:00, 13.18it/s]
Evaluating Preferences: 100%|██████████| 867/867 [00:51<00:00, 16.74it/s]
Evaluating Preferences: 100%|██████████| 762/762 [00:47<00:00, 15.94it/s]


In [46]:
with open(".\quality\pref_third_party_llamajudge_synonym_corrupt_both_quality.json", "w") as f:
    json.dump(preference_results, f, indent=4)  

## One reason modified (old corrupt Syn)

In [47]:
async def evaluate_pref_quality_third_party_async_single_perturb(judge_model, evaluator_model, evaluatee_model, records, repeat_failures=False):
    model1 = evaluator_model
    model2 = evaluatee_model

    tasks = []

    for record in records:
        gt_label = record['output_label']
        pid = record.get('pid')
        model1_label = record.get(model1 + '_output_label')
        model2_label = record.get(model2 + '_output_label')
        if repeat_failures:
            if pid not in failed_comparisons:
                continue  # Only retry known failed records
        # Model 2 is the right answer always 
        if model1_label and model2_label and model1_label != gt_label and model2_label == gt_label:
            tasks.append(process_pref_record_third_party(record, judge_model, model1, model2))
        if model1_label and model2_label and model1_label == gt_label and model2_label != gt_label:
            tasks.append(process_pref_record_third_party(record, judge_model, model2, model1))
    for future in tqdm_asyncio.as_completed(tasks, total=len(tasks), desc="Evaluating Preferences"):
        await future



async def process_pref_record_third_party(record, judge_model, model1, model2):
    try:
        result = {
            'judge_model': judge_model,
            'correct_answer_model': model2,
            'wrong_answer_model': model1,
            'pid': record['pid']
        }

        answer1 = record[model1 + '_output_label'] + ". " + record[model1 + '_reason_perturb2_meta']
        answer2 = record[model2 + '_output_label'] + ". " + record[model2 + '_reason']

        forward_result = await get_model_choice_qa_comparison_async(
            judge_model, answer1, answer2, record['questions'], record['text'], return_logprobs=1
        )
        backward_result = await get_model_choice_qa_comparison_async(
            judge_model, answer2, answer1, record['questions'], record['text'], return_logprobs=1
        )

        if not forward_result or not backward_result:
            failed_comparisons.append(record['pid'])
            return
        result["forward_comparison"] = forward_result.tokens[0]
        result["forward_probability"] = exp(forward_result.token_logprobs[0])
        result["backward_comparison"] = backward_result.tokens[0]
        result["backward_probability"] = exp(backward_result.token_logprobs[0])

        preference_results.append(result)

    except Exception as e:
        print(f"Failed to process record {record['pid']}: {e}")
        failed_comparisons.append(record['pid'])


In [48]:
preference_results = []
failed_comparisons = []

In [49]:
await evaluate_pref_quality_third_party_async_single_perturb("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async_single_perturb("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async_single_perturb("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3",  responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async_single_perturb("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async_single_perturb("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo",  responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async_single_perturb("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, repeat_failures=True)
    failed_comparisons = []



Evaluating Preferences: 100%|██████████| 724/724 [00:40<00:00, 17.92it/s]
Evaluating Preferences: 100%|██████████| 867/867 [03:29<00:00,  4.13it/s]  
Evaluating Preferences: 100%|██████████| 762/762 [00:41<00:00, 18.32it/s]


In [50]:
await evaluate_pref_quality_third_party_async_single_perturb("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async_single_perturb("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async_single_perturb("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3",  responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async_single_perturb("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, repeat_failures=True)
    failed_comparisons = []
########
await evaluate_pref_quality_third_party_async_single_perturb("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo",  responses)
if (len(failed_comparisons) > 0):
    await evaluate_pref_quality_third_party_async_single_perturb("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, repeat_failures=True)
    failed_comparisons = []



Evaluating Preferences: 100%|██████████| 724/724 [00:43<00:00, 16.53it/s]
Evaluating Preferences: 100%|██████████| 867/867 [00:51<00:00, 16.78it/s]
Evaluating Preferences:  58%|█████▊    | 445/762 [00:33<00:08, 39.18it/s]

Failed QA comparison call for model Llama-4-Maverick-17B-128E-Instruct-FP8: Error communicating with Together


Evaluating Preferences: 100%|██████████| 762/762 [00:42<00:00, 17.86it/s]
Evaluating Preferences: 100%|██████████| 1/1 [00:00<00:00,  1.82it/s]


In [51]:
with open(".\quality\pref_third_party_llamajudge_synonym_corrupt_wrong_quality.json", "w") as f:
    json.dump(preference_results, f, indent=4)  

# Generate Data

## Get the full sentence (synonym - not great)

In [120]:
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
import random
import openai
import os
random.seed(123)

In [None]:
SYNONYM_REPLACER_PROMPT_TEMPLATE = """You are a helpful assistant that rewrites sentences by replacing 2 words with their synonyms while preserving the overall semantics. Below is a question and its answer. 
Question:
{question}

Answer:
{answer}

Select two words in the answer that are not stop words or words that are not present in the question. 
Then, replace them with their synonyms in the answer. Return the modified answer with the two words replaced by their synonyms.
Make sure to not change the meaning of the answer. Return only the modified answer and nothing else."""

In [123]:
nest_asyncio.apply()
async_client = AsyncTogether(api_key=os.environ.get("TOGETHER_API_KEY"))

# Global: concurrency limiter
qa_semaphore = asyncio.Semaphore(10)
failed_comparisons = []

async def get_synonym_answer(model_name, answer, question):
    async with qa_semaphore:
        prompt = SYNONYM_REPLACER_PROMPT_TEMPLATE.format(
             question=question, answer=answer,
        )
        exact_model = format_model_name_together(model_name)

        try:
            response = await async_client.chat.completions.create(
                model=exact_model,
                messages=[
                    {"role": "user", "content": prompt},
                ],
                temperature=0.0
            )

            return response.choices[0].message.content

        except Exception as e:
            print(f"Failed QA comparison call for model {model_name}: {e}")
            return None


In [139]:
record2 = responses[3]

In [129]:
modified_answer1 = await get_synonym_answer("meta-llama/Llama-3.3-70B-Instruct-Turbo", record1['Qwen2.5-7B-Instruct-Turbo_reason'] , record1['questions'])

In [136]:
print(record1['questions'])
print('Original Reason:', record1['Qwen2.5-7B-Instruct-Turbo_reason'])
print('Initial synonym:', record1['Qwen2.5-7B-Instruct-Turbo_reason_perturb2_meta'])
print('Auto    synonym:',modified_answer1)

Why does Deirdre get so upset when Blake Past suggests she go to prom with the young man?

 (A) Because Blake is trying to guilt Deirdre into going with the young man by telling her that it'll ease her conscience. 
 (B) Because Deirdre has fallen in love with Blake, despite his age, and wants him to take her to the prom.  
 (C) Because Blake is acting like he's her father, which is a sensitive topic for Deirdre because she lost her real parents. 
 (D) Because the young man gave up his right arm in order to afford tickets to the prom, and this disgusts Deirdre. 
Original Reason: The text mentions that Blake Past suggests Deirdre go to prom with the young man, and Deirdre gets upset because she feels Blake Past is acting like her father, which is sensitive for her due to her lost parents.
Initial synonym: The text mentions that Blake Past suggests Deirdre go to the ball with the young gentleman and Deirdre gets upset because she feels Blake Past is behaving like her father which is sensi

In [141]:
modified_answer2 = await get_synonym_answer("meta-llama/Llama-3.3-70B-Instruct-Turbo", record2['Qwen2.5-7B-Instruct-Turbo_reason'] , record2['questions'])

In [142]:
print(record2['questions'])
print('Original Reason:', record2['Qwen2.5-7B-Instruct-Turbo_reason'])
print('Initial synonym:', record2['Qwen2.5-7B-Instruct-Turbo_reason_perturb2_meta'])
print('Auto    synonym:',modified_answer2)

Why did Blake create the three female super-images of Miss Stoddart, Officer Finch, and Vera Velvetskin?

 (A) He feels guilty about having slept with Eldoria which perpetuated the demand for female prostitution. 
 (B) Even though he is a psycheye, he feels guilty about hunting down Sabrina York. 
 (C) He is still grieving his mother's death and regrets not being a more loving son.
 (D) He feels guilty about hurting Deirdre's feelings after her graduation when he ignored their romantic connection, and instead, played the part of a parent. 
Original Reason: The text mentions that Blake created these super-images because he felt guilty about hurting Deirdre's feelings after her graduation, which aligns with option D.
Initial synonym: The text refers that Blake created these because he felt guilty about hurting Deirdre's feelings after her graduation which aligns with option D
Auto    synonym: The text mentions that Blake created these super-images because he felt guilty about hurting Dei

In [143]:
record3 = responses[4]

In [145]:
modified_answer3 = await get_synonym_answer("meta-llama/Llama-3.3-70B-Instruct-Turbo", record3['Qwen2.5-7B-Instruct-Turbo_reason'] , record3['questions'])

In [146]:
print(record3['questions'])
print('Original Reason:', record3['Qwen2.5-7B-Instruct-Turbo_reason'])
print('Initial synonym:', record3['Qwen2.5-7B-Instruct-Turbo_reason_perturb2_meta'])
print('Auto    synonym:',modified_answer3)

Sabrina York is 

 (A) a criminal that Blake is hunting
 (B) a psycheye that taught Blake all the tricks
 (C) an old friend of Blake's
 (D) Eldoria's alter ego
Original Reason: The text mentions that Sabrina York's nameplate on the kitchen range read 'Sabrina York', indicating she is Eldoria's alter ego.
Initial synonym: The text mentions that Sabra New York nameplate on the kitchen range read New York indicating she is Eldoria alter ego
Auto    synonym: The text mentions that Sabrina York's nameplate on the kitchen range read 'Sabrina York', indicating she is Eldoria's pseudonym. 

The text was modified by replacing 'alter ego' with 'pseudonym' and 'mentions' with no replacement, instead 'indicates' could be used but 'mentions' was replaced with no word, instead 'text' could be replaced with 'document' or 'indication' could be used but 'indicates' is a better fit for 'mentions', so 'mentions' was replaced with no word and 'range' could be replaced with 'stove' or 'appliance', so 'rang

## Get List of words from LM (auto-synonym)

In [15]:
SYNONYM_SUGGESTER_PROMPT_TEMPLATE = """You are a helpful assistant that helps rewrites sentences. 
Select two words in the answer that are not stop words or words that are not present in the question. 
Then, suggest their replacements with their synonyms in the answer sentence - make sure the suggested words do not change the meaning of the answer. 

### System Output Format:
Respond in **JSON format** with:
- `"selected_words"`: The list of words in the original answer.
- `"replacements"`: The list of replacement words in the same order.

### Question:
{question}

### Answer:
{answer}

### Expected Response Format:
```
{{
  "selected_words": "[word1, word2]",
  "replacements": "[replacement1, replacement2]"
}}
"""

In [34]:
nest_asyncio.apply()
async_client = AsyncTogether(api_key=os.environ.get("TOGETHER_API_KEY"))

# Global: concurrency limiter
qa_semaphore = asyncio.Semaphore(10)
failed_comparisons = []

async def get_synonym_list_answer(model_name, answer, question):
    async with qa_semaphore:
        prompt = SYNONYM_SUGGESTER_PROMPT_TEMPLATE.format(
             question=question, answer=answer,
        )
        exact_model = format_model_name_together(model_name)

        try:
            response = await async_client.chat.completions.create(
                model=exact_model,
                messages=[
                    {"role": "user", "content": prompt},
                ],
                response_format={"type": "json_object"},
                temperature=0.0
            )
            api_response = response.choices[0].message.content
            response_json = fix_json_response(api_response)
            return response.choices[0].message.content

        except Exception as e:
            print(f"Failed QA comparison call for model {model_name}: {e}")
            return None


In [35]:
def apply_reason_modifications(reason_text, modify_list_json):
    modified_text = reason_text
    modify_list_json = json.loads(modify_list_json)
    selected_words = modify_list_json.get("selected_words", [])
    replacements = modify_list_json.get("replacements", [])

    for original, replacement in zip(selected_words, replacements):
        if original and replacement:
            modified_text = modified_text.replace(original, replacement)

    return modified_text


### Sample

In [149]:
record1 = responses[1]

In [151]:
modify_list_json = await get_synonym_list_answer("meta-llama/Llama-3.3-70B-Instruct-Turbo", record1['Qwen2.5-7B-Instruct-Turbo_reason'] , record1['questions'])

In [161]:
modify_list_json

'{\n  "selected_words": ["sensitive", "lost"],\n  "replacements": ["emotional", "deceased"]\n}'

In [168]:
modified_answer1 = apply_reason_modifications(record1['Qwen2.5-7B-Instruct-Turbo_reason'], modify_list_json)
print(record1['questions'])
print('Original Reason:', record1['Qwen2.5-7B-Instruct-Turbo_reason'])
print('Initial synonym:', record1['Qwen2.5-7B-Instruct-Turbo_reason_perturb2_meta'])
print('Auto    synonym:',modified_answer1)

Why does Deirdre get so upset when Blake Past suggests she go to prom with the young man?

 (A) Because Blake is trying to guilt Deirdre into going with the young man by telling her that it'll ease her conscience. 
 (B) Because Deirdre has fallen in love with Blake, despite his age, and wants him to take her to the prom.  
 (C) Because Blake is acting like he's her father, which is a sensitive topic for Deirdre because she lost her real parents. 
 (D) Because the young man gave up his right arm in order to afford tickets to the prom, and this disgusts Deirdre. 
Original Reason: The text mentions that Blake Past suggests Deirdre go to prom with the young man, and Deirdre gets upset because she feels Blake Past is acting like her father, which is sensitive for her due to her lost parents.
Initial synonym: The text mentions that Blake Past suggests Deirdre go to the ball with the young gentleman and Deirdre gets upset because she feels Blake Past is behaving like her father which is sensi

### Generate for the whole responses

In [36]:
import asyncio
from tqdm.asyncio import tqdm_asyncio

# List of model base names
target_models = [
    "Qwen2.5-7B-Instruct-Turbo",
    "Meta-Llama-3.1-8B-Instruct-Turbo",
    "DeepSeek-V3",
    "Llama-4-Scout-17B-16E-Instruct",
    "Llama-4-Maverick-17B-128E-Instruct-FP8"
]

# Semaphore to limit concurrency
perturb_semaphore = asyncio.Semaphore(10)

async def process_reason_perturbation(record, model_name):
    async with perturb_semaphore:
        question = record.get("questions", "")
        reason_key = f"{model_name}_reason"
        perturb_key = f"{model_name}_reason_perturb_llm_auto"

        if reason_key not in record or not record[reason_key] or not question:
            return  # Skip if reason or question missing

        try:
            modify_list_json = await get_synonym_list_answer(
                "meta-llama/Llama-3.3-70B-Instruct-Turbo",
                record[reason_key],
                question
            )
            modified_answer = apply_reason_modifications(record[reason_key], modify_list_json)
            record[perturb_key] = modified_answer
        except Exception as e:
            print(f"Failed on model {model_name}, pid {record.get('pid', 'unknown')}: {e}, modify_list_json: {modify_list_json}")
            record[perturb_key] = None




In [45]:
async def apply_perturbations_to_all_responses(responses):
    tasks = []

    for record in responses:
        gt_label = record.get("output_label")
        if not gt_label:
            continue  # Skip if ground truth is missing

        # Get each model's predicted label
        model_labels = {
            model: record.get(f"{model}_output_label")
            for model in target_models
        }

        # Only proceed if any model prediction is incorrect
        if any(label != gt_label for label in model_labels.values() if label is not None):
            for model in target_models:
                perturb_key = f"{model}_reason_perturb_llm_auto"
                if record.get(perturb_key) is None:  # Only if not already processed
                    tasks.append(process_reason_perturbation(record, model))

    for task in tqdm_asyncio.as_completed(tasks, total=len(tasks), desc="Perturbing reasons"):
        await task

    return responses


In [46]:
responses = await apply_perturbations_to_all_responses(responses)

Perturbing reasons: 100%|██████████| 70/70 [00:19<00:00,  3.68it/s]


In [49]:
with open(".\quality\quality_responses.json", "w") as f:
    json.dump(responses, f, indent=4)  # indent=4 makes it more readable

## selecting with heurestics (selects common words)

In [186]:
stop_words = {
    "the", "his", "her", "an", "a", "this", "on", "is", "of", "and", "to", "in", "that", "it", 
    "with", "as", "for", "was", "were", "be", "by", "at", "or", "which", "from", "but", "not"
}

In [190]:
num_tries = []

def get_candidate_words(sentence, question, n_words=2):
    global num_tries
    candidate_words = []
    tries = 0

    sentence_words = sentence.split()
    question_words = set(question.lower().split())

    while len(candidate_words) < n_words and tries < 1000:
        word = random.choice(sentence_words)
        tries += 1

        # Normalize to lowercase for comparison
        clean_word = word.lower().strip(".,!?\"'()")

        if clean_word not in stop_words and clean_word not in question_words and clean_word not in candidate_words:
            candidate_words.append(clean_word)

    num_tries.append(tries)
    return candidate_words


In [None]:
SYNONYM_SUGGESTER_LIMITED_PROMPT_TEMPLATE = """You are a helpful assistant that helps rewrites sentences. 
Given are two words in the answer that you need to suggest replacement with their synonyms. 
Make sure the suggested words do not change the meaning of the answer. 

### System Output Format:
Respond in **JSON format** with:
- `"replacements"`: The list of replacement words in the same order.

### Question:
{question}

### Answer:
{answer}

### Selected Words:
{selected_words}

### Expected Response Format:
```
{{
  "replacements": "[replacement1, replacement2]"
}}
"""

In [195]:
nest_asyncio.apply()
async_client = AsyncTogether(api_key=os.environ.get("TOGETHER_API_KEY"))

# Global: concurrency limiter
qa_semaphore = asyncio.Semaphore(10)

async def get_synonym_from_list_answer(model_name, answer, question, selected_words):
    async with qa_semaphore:
        prompt = SYNONYM_SUGGESTER_LIMITED_PROMPT_TEMPLATE.format(
             question=question, answer=answer,selected_words=selected_words
        )
        exact_model = format_model_name_together(model_name)

        try:
            response = await async_client.chat.completions.create(
                model=exact_model,
                messages=[
                    {"role": "user", "content": prompt},
                ],
                temperature=0.0
            )

            return response.choices[0].message.content

        except Exception as e:
            print(f"Failed QA comparison call for model {model_name}: {e}")
            return None


In [196]:
words_to_replace = get_candidate_words(record3['Qwen2.5-7B-Instruct-Turbo_reason'], record3['questions'])

In [197]:
modify_list_json = await get_synonym_from_list_answer("meta-llama/Llama-3.3-70B-Instruct-Turbo", record3['Qwen2.5-7B-Instruct-Turbo_reason'] , record3['questions'], words_to_replace)
modify_list_json

'{\n  "replacements": ["cooking", "indicated"]\n}'

In [198]:
def apply_reason_modifications_given_list(reason_text, words_to_replace, modify_list_json):
    modified_text = reason_text
    modify_list_json = json.loads(modify_list_json)
    selected_words = words_to_replace
    replacements = modify_list_json.get("replacements", [])

    for original, replacement in zip(selected_words, replacements):
        if original and replacement:
            modified_text = modified_text.replace(original, replacement)

    return modified_text


In [199]:
modified_answer3 = apply_reason_modifications_given_list(record3['Qwen2.5-7B-Instruct-Turbo_reason'], words_to_replace, modify_list_json)
print(record3['questions'])
print('Original    Reason:', record3['Qwen2.5-7B-Instruct-Turbo_reason'])
print('Initial    synonym:', record3['Qwen2.5-7B-Instruct-Turbo_reason_perturb2_meta'])
print('Heurestics synonym:',modified_answer3)

Sabrina York is 

 (A) a criminal that Blake is hunting
 (B) a psycheye that taught Blake all the tricks
 (C) an old friend of Blake's
 (D) Eldoria's alter ego
Original    Reason: The text mentions that Sabrina York's nameplate on the kitchen range read 'Sabrina York', indicating she is Eldoria's alter ego.
Initial    synonym: The text mentions that Sabra New York nameplate on the kitchen range read New York indicating she is Eldoria alter ego
Heurestics synonym: The text mentions that Sabrina York's nameplate on the cooking range indicated 'Sabrina York', indicating she is Eldoria's alter ego.


### Generate Report 50 samples

In [201]:
def get_first_50_valid_records(responses):
    selected = []
    count = 0

    for record in responses:
        if 'Qwen2.5-7B-Instruct-Turbo_reason_perturb2_meta' not in record:
            continue

        selected.append({
            'pid': record.get('pid'),
            'questions': record.get('questions'),
            'Qwen2.5-7B-Instruct-Turbo_reason': record.get('Qwen2.5-7B-Instruct-Turbo_reason'),
            'Qwen2.5-7B-Instruct-Turbo_reason_perturb2_meta': record.get('Qwen2.5-7B-Instruct-Turbo_reason_perturb2_meta')
        })

        count += 1
        if count == 50:
            break

    return selected


In [None]:
first_50 = get_first_50_valid_records(responses)
first_50

In [204]:
async def add_llm_perturbations(first_50, model_name="meta-llama/Llama-3.3-70B-Instruct-Turbo"):
    for record in tqdm(first_50, desc="Generating LLM perturbations"):
        original_reason = record['Qwen2.5-7B-Instruct-Turbo_reason']
        question = record['questions']

        # === 1. LLM-AUTO ===
        try:
            modify_list_json = await get_synonym_list_answer(model_name, original_reason, question)
            modified_answer1 = apply_reason_modifications(original_reason, modify_list_json)
            record['Qwen2.5-7B-Instruct-Turbo_reason_perturb_llm_auto'] = modified_answer1
        except Exception as e:
            print(f"[AUTO] Failed for pid {record['pid']}: {e}")
            record['Qwen2.5-7B-Instruct-Turbo_reason_perturb_llm_auto'] = None

        # === 2. LLM-HEURISTICS ===
        try:
            words_to_replace = get_candidate_words(original_reason, question, n_words=3)
            replacements = await get_synonym_from_list_answer(model_name, original_reason, question, words_to_replace)
            modified_answer2 = apply_reason_modifications_given_list(original_reason, words_to_replace, replacements)
            record['Qwen2.5-7B-Instruct-Turbo_reason_perturb_llm_heurestics'] = modified_answer2
        except Exception as e:
            print(f"[HEUR] Failed for pid {record['pid']}: {e}")
            record['Qwen2.5-7B-Instruct-Turbo_reason_perturb_llm_heurestics'] = None

    return first_50


In [205]:
first_50_augmented = await add_llm_perturbations(first_50)


Generating LLM perturbations:  16%|█▌        | 8/50 [00:34<01:54,  2.72s/it]

[HEUR] Failed for pid 30029_F5N22U40_7_0: Expecting value: line 1 column 1 (char 0)
[AUTO] Failed for pid 30029_F5N22U40_8_0: Expecting value: line 1 column 1 (char 0)


Generating LLM perturbations:  20%|██        | 10/50 [00:41<02:12,  3.32s/it]

[HEUR] Failed for pid 62139_J05FWZR6_3_0: Expecting value: line 1 column 1 (char 0)


Generating LLM perturbations:  26%|██▌       | 13/50 [00:50<02:08,  3.46s/it]

[HEUR] Failed for pid 62139_J05FWZR6_7_0: Expecting value: line 1 column 1 (char 0)


Generating LLM perturbations:  34%|███▍      | 17/50 [00:58<01:23,  2.53s/it]

[HEUR] Failed for pid 63523_STSHLFEA_3_0: Expecting value: line 1 column 1 (char 0)


Generating LLM perturbations:  42%|████▏     | 21/50 [01:12<02:00,  4.16s/it]

[HEUR] Failed for pid 63523_STSHLFEA_9_0: Expecting value: line 1 column 1 (char 0)


Generating LLM perturbations:  44%|████▍     | 22/50 [01:13<01:29,  3.20s/it]

[HEUR] Failed for pid 63523_STSHLFEA_10_0: Expecting value: line 1 column 1 (char 0)


Generating LLM perturbations:  50%|█████     | 25/50 [01:21<01:01,  2.45s/it]

[AUTO] Failed for pid 63401_ZCP5ZDGL_6_0: Expecting value: line 1 column 1 (char 0)


Generating LLM perturbations:  56%|█████▌    | 28/50 [01:24<00:36,  1.68s/it]

[HEUR] Failed for pid 63401_ZCP5ZDGL_8_0: Expecting value: line 1 column 1 (char 0)


Generating LLM perturbations:  60%|██████    | 30/50 [01:27<00:30,  1.50s/it]

[HEUR] Failed for pid 63401_ZCP5ZDGL_10_0: Expecting value: line 1 column 1 (char 0)


Generating LLM perturbations:  64%|██████▍   | 32/50 [01:39<01:09,  3.88s/it]

[AUTO] Failed for pid 62476_Z8GFDCIZ_3_0: Expecting value: line 1 column 1 (char 0)


Generating LLM perturbations:  66%|██████▌   | 33/50 [01:40<00:55,  3.27s/it]

[HEUR] Failed for pid 62476_Z8GFDCIZ_3_0: Expecting value: line 1 column 1 (char 0)


Generating LLM perturbations:  70%|███████   | 35/50 [01:43<00:36,  2.43s/it]

[HEUR] Failed for pid 62476_Z8GFDCIZ_5_0: Expecting value: line 1 column 1 (char 0)


Generating LLM perturbations:  80%|████████  | 40/50 [01:58<00:31,  3.17s/it]

[HEUR] Failed for pid 52845_91NAQ9LY_1_0: Expecting value: line 1 column 1 (char 0)


Generating LLM perturbations:  86%|████████▌ | 43/50 [02:03<00:14,  2.13s/it]

[HEUR] Failed for pid 52845_91NAQ9LY_5_0: Expecting value: line 1 column 1 (char 0)


Generating LLM perturbations:  90%|█████████ | 45/50 [02:06<00:08,  1.78s/it]

[HEUR] Failed for pid 52845_91NAQ9LY_7_0: Expecting value: line 1 column 1 (char 0)


Generating LLM perturbations:  96%|█████████▌| 48/50 [02:10<00:02,  1.47s/it]

[HEUR] Failed for pid 30029_XQTTOPHP_3_0: Expecting value: line 1 column 1 (char 0)


Generating LLM perturbations:  98%|█████████▊| 49/50 [02:12<00:01,  1.56s/it]

[HEUR] Failed for pid 30029_XQTTOPHP_4_0: Expecting value: line 1 column 1 (char 0)


Generating LLM perturbations: 100%|██████████| 50/50 [02:14<00:00,  2.69s/it]


In [207]:
len(first_50_augmented)

50

In [208]:
def filter_complete_perturbations(records):
    required_keys = [
        'Qwen2.5-7B-Instruct-Turbo_reason_perturb_llm_auto',
        'Qwen2.5-7B-Instruct-Turbo_reason_perturb_llm_heurestics'
    ]

    cleaned = [
        record for record in records
        if all(key in record and record[key] is not None for key in required_keys)
    ]

    return cleaned


In [211]:
cleaned_records = filter_complete_perturbations(first_50_augmented)
print(f"Retained {len(cleaned_records)} fully-processed records.")

Retained 33 fully-processed records.


In [223]:
from sentence_transformers import SentenceTransformer, util

# Load model once
model = SentenceTransformer("all-MiniLM-L6-v2")

def add_similarity_scores(records):
    for record in records:
        try:
            base_reason = record["Qwen2.5-7B-Instruct-Turbo_reason"]
            initial_perturb = record["Qwen2.5-7B-Instruct-Turbo_reason_perturb2_meta"]
            auto_perturb = record["Qwen2.5-7B-Instruct-Turbo_reason_perturb_llm_auto"]
            heur_perturb = record["Qwen2.5-7B-Instruct-Turbo_reason_perturb_llm_heurestics"]

            embeddings = model.encode([base_reason, auto_perturb, heur_perturb, initial_perturb])

            # Similarity between base and auto perturbation
            sim_auto = util.cos_sim(embeddings[0], embeddings[1]).item()
            # Similarity between base and heuristics perturbation
            sim_heur = util.cos_sim(embeddings[0], embeddings[2]).item()
            # similarity between base and initial perturbation
            sim_initial = util.cos_sim(embeddings[0], embeddings[3]).item()
            
            record["Qwen2.5-7B-Instruct-Turbo_reason_perturb_llm_auto_similarity"] = sim_auto
            record["Qwen2.5-7B-Instruct-Turbo_reason_perturb_llm_heurestics_similarity"] = sim_heur
            record["Qwen2.5-7B-Instruct-Turbo_reason_perturb_llm_initial_similarity"] = sim_initial
        except Exception as e:
            print(f"Similarity computation failed for pid {record.get('pid')}: {e}")

    return records


In [224]:
scored_records = add_similarity_scores(cleaned_records)


In [231]:
import pandas as pd
def convert_to_df(scored_records):
    rows = []

    for record in scored_records:
        rows.append({
            "pid": record.get("pid"),
            "question": record.get("questions"),
            "original": record.get("Qwen2.5-7B-Instruct-Turbo_reason"),
            "auto_perturb": record.get("Qwen2.5-7B-Instruct-Turbo_reason_perturb_llm_auto"),
            "auto_pertub_similarity": record.get("Qwen2.5-7B-Instruct-Turbo_reason_perturb_llm_auto_similarity"),
            "heurestics": record.get("Qwen2.5-7B-Instruct-Turbo_reason_perturb_llm_heurestics"),
            "heurestics_similarity": record.get("Qwen2.5-7B-Instruct-Turbo_reason_perturb_llm_heurestics_similarity"),
            "initial": record.get("Qwen2.5-7B-Instruct-Turbo_reason_perturb2_meta"),
            "initial_similarity": record.get("Qwen2.5-7B-Instruct-Turbo_reason_perturb_llm_initial_similarity")
        })

    df = pd.DataFrame(rows, columns=[
        'pid', 'question', 'original', 'auto_perturb', 
        'auto_pertub_similarity', 'heurestics', 'heurestics_similarity', 'initial', 'initial_similarity'
    ])
    return df


In [235]:
df = convert_to_df(scored_records)
df.to_csv(".\quality\perturbation_similarity_results.csv", index=False)
print("CSV saved as 'perturbation_similarity_results.csv'")


CSV saved as 'perturbation_similarity_results.csv'


In [222]:
print("Number of tries to get candidate words:", num_tries)
print("Average number of tries:", sum(num_tries) / len(num_tries) if num_tries else 0)

Number of tries to get candidate words: [2, 7, 8, 8, 4, 6, 16, 9, 7, 10, 18, 9, 10, 13, 7, 7, 11, 22, 16, 12, 13, 16, 3, 9, 6, 5, 12, 13, 7, 11, 72, 8, 3, 12, 8, 8, 25, 6, 11, 6, 10, 12, 6, 10, 4, 5, 10, 16, 7, 8, 4, 7]
Average number of tries: 10.673076923076923


## Generate Paraphrasing in Async

In [107]:
with open(".\quality\quality_responses.json", 'r') as file:
    responses = json.load(file)

In [122]:
import asyncio
from tqdm.asyncio import tqdm_asyncio

# Models involved
target_models = [
    "Qwen2.5-7B-Instruct-Turbo",
    "Meta-Llama-3.1-8B-Instruct-Turbo",
    "DeepSeek-V3",
    "Llama-4-Scout-17B-16E-Instruct",
    "Llama-4-Maverick-17B-128E-Instruct-FP8"
]

# Optional: limit parallel API calls
paraphrase_semaphore = asyncio.Semaphore(10)

# Async version of paraphrasing function
async def paraphrase_reasoning_async(reasoning, model_name):
    async with paraphrase_semaphore:
        exact_model = format_model_name_together(model_name)

        try:
            response = await async_client.chat.completions.create(
                model=exact_model,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that paraphrases a sentence while preserving context and meaning. You paraphrase the sentence(s) given, and only reply with the paraphrased sentence and no other text."},
                    {"role": "user", "content": f'This is a sentence which explains the reasoning behind an answer to a question. Your response is strictly the new paraphrased reasoning. The sentence is: {reasoning}'}
                ]
            )
            return response.choices[0].message.content
        except Exception as e:
            print(f"Paraphrasing failed with {model_name}: {e}")
            return None

def create_paraphrase_task(record, model_name, paraphrasing_model, reason):
    async def run():
        paraphrased_reason = await paraphrase_reasoning_async(reason, paraphrasing_model)
        key = f"{model_name}_reason_paraphrased_{paraphrasing_model}"
        record[key] = paraphrased_reason
    return run()


# One record → all its paraphrase tasks
async def process_record_paraphrases(record):
    tasks = []
    gt_label = record.get("output_label")

    if not gt_label:
        return

    model_labels = {
        model: record.get(f"{model}_output_label")
        for model in target_models
    }

    if not any(label and label != gt_label for label in model_labels.values()):
        return

    for model_name in target_models:
        reason_key = f"{model_name}_reason"
        reason = record.get(reason_key)
        if not reason:
            continue

        other_models = [m for m in target_models if m != model_name]

        for paraphrasing_model in other_models:
            paraphrase_key = f"{model_name}_reason_paraphrased_{paraphrasing_model}"
            if record.get(paraphrase_key) is not None:
                continue

            # ✅ Create task with proper variable binding
            tasks.append(create_paraphrase_task(record, model_name, paraphrasing_model, reason))

    await asyncio.gather(*tasks)


In [None]:
# Main async runner
async def run_all_paraphrasing(responses):
    await tqdm_asyncio.gather(*[process_record_paraphrases(r) for r in responses])

In [None]:
await run_all_paraphrasing(responses)


In [127]:
with open(".\quality\quality_responses.json", "w") as f:
    json.dump(responses, f, indent=4)  # indent=4 makes it more readable

## Generate 'Dupe' Answer

In [11]:
ANSWER_SELECTION_DUPE_PROMPT_TEMPLATE = """You are an AI system designed to analyze questions and their corresponding text passages to determine the reason behind an answer.

### System Output Format:
Respond in **JSON format** with:
- `"reason"`: A short explanation of why this answer is correct.

### Task:
Given the **question**, **text** and the **correct_answer** below, generate a brief justification reason for the given answer.

#### Question:
{questions}

#### Text:
{text}

#### Correct Answer:
{correct_answer}

### Expected Response Format:
```
{{
  "reason": "The text states that the event lasted 10 hours, which corresponds to option B."
}}"""


In [12]:
def extract_braces_content(s):
    match = re.search(r'\{(.*?)\}', s, re.DOTALL)
    return match.group(0) if match else ""

## Get Selective Dupes

In [103]:
with open('.\quality\quality_responses.json', 'r') as file:
    responses = json.load(file)

In [105]:
nest_asyncio.apply()

async_client = AsyncTogether(api_key=os.environ.get("TOGETHER_API_KEY"))

# Limit number of concurrent API calls
semaphore = asyncio.Semaphore(20)  # <-- Set to 10, 20, or 50 depending on your system and Together API limits

failed_records_id = []
failed_records = []

async def process_record_dupe(record, model_name, alternative_models):
    async with semaphore:  # Throttle concurrent calls
        questions = record.get("questions", "")
        text = record.get("text", "")
        current_label = record.get(model_name + "_output_label")

        if not questions or not text or not current_label:
            return None  # Skip invalid

        # Step 1: Find the first alternate label that differs from GT
        alternate_output_label = None
        alternate_found = False
        for alt_model in alternative_models:
            alt_label = record.get(f"{alt_model}_output_label")
            if alt_label and alt_label != current_label and alternate_found is False:
                alternate_output_label = alt_label
                
        # print(f"Record ID: {record.get('pid')}, Actual:{current_label} Alternate Output Label: {alternate_output_label}")
        
        if not alternate_output_label:
            return None  # All models agree with GT → skip

        # Step 2: Format prompt
        prompt = ANSWER_SELECTION_DUPE_PROMPT_TEMPLATE.format(
            questions=questions,
            text=text,
            correct_answer=alternate_output_label
        )
        exact_model = format_model_name_together(model_name)

        # Step 3: Call the model
        try:
            response = await async_client.chat.completions.create(
                model=exact_model,
                messages=[{"role": "user", "content": prompt}],
                response_format={"type": "json_object"}
            )
            api_response = response.choices[0].message.content
            api_response = extract_braces_content(api_response)

            key_name_output = model_name + "_Dupe_output_label"
            key_name_reason = model_name + "_Dupe_reason"

            response_json = fix_json_response(api_response)
            record[key_name_output] = alternate_output_label
            record[key_name_reason] = response_json.get("reason")

            return record

        except Exception as e:
            failed_records.append(str(e))
            failed_records_id.append(record.get("pid"))
            return None


async def generate_dupe_answer_selection_quality_async(model_name, alternative_models, start_index, end_index, processed_data, repeat_failures=False):
    tasks = []
    for record in processed_data[start_index:end_index]:
        # If repeat_failures is True, only process the record if its 'pid' is in failed_records_id.
        if repeat_failures and record.get("pid") not in failed_records_id:
            continue
        tasks.append(process_record_dupe(record, model_name, alternative_models))
    
    results = []
    for future in tqdm_asyncio.as_completed(tasks, total=len(tasks), desc="Processing Records"):
        result = await future
        if result:
            results.append(result)
            
    return results, failed_records_id, failed_records



In [106]:
model_name = "Llama-4-Scout-17B-16E-Instruct"
alterntive_models = ["DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo"]

results, failed_ids, failed_content = asyncio.run(generate_dupe_answer_selection_quality_async(model_name, alterntive_models,0, len(responses), responses))

Processing Records: 100%|██████████| 2086/2086 [03:16<00:00, 10.60it/s]


In [110]:
dupe_label_count = 0

for record in responses:
    for key in record.keys():
        if key.endswith("_Dupe_output_label") and record[key] is not None:
            dupe_label_count += 1

print(f"Total _Dupe_output_label fields found: {dupe_label_count}")


Total _Dupe_output_label fields found: 1591


In [117]:
model_name = "Llama-4-Maverick-17B-128E-Instruct-FP8"
alterntive_models = ["DeepSeek-V3",  "Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo"]

results, failed_ids, failed_content = asyncio.run(generate_dupe_answer_selection_quality_async(model_name, alterntive_models,0, len(responses), responses))

Processing Records: 100%|██████████| 2086/2086 [12:31<00:00,  2.78it/s] 


In [119]:
model_name = "DeepSeek-V3"
alterntive_models = ["Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo"]

results, failed_ids, failed_content = asyncio.run(generate_dupe_answer_selection_quality_async(model_name, alterntive_models,0, len(responses), responses))

Processing Records: 100%|██████████| 2086/2086 [08:40<00:00,  4.01it/s]


In [123]:
model_name = "Qwen2.5-7B-Instruct-Turbo"
alterntive_models = ["DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo"]

results, failed_ids, failed_content = asyncio.run(generate_dupe_answer_selection_quality_async(model_name, alterntive_models,0, len(responses), responses))

Processing Records: 100%|██████████| 2086/2086 [02:05<00:00, 16.62it/s]


In [124]:
model_name = "Meta-Llama-3.1-8B-Instruct-Turbo"
alterntive_models = ["DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo"]

results, failed_ids, failed_content = asyncio.run(generate_dupe_answer_selection_quality_async(model_name, alterntive_models,0, len(responses), responses))

Processing Records: 100%|██████████| 2086/2086 [05:06<00:00,  6.81it/s]


In [125]:
with open(".\quality\quality_responses.json", "w") as f:
    json.dump(responses, f, indent=4)  # indent=4 makes it more readable

In [6]:
## Fix Format to include output_label in the Dupe_reason
# Define model list
models = [
    "DeepSeek-V3", 
    "Llama-4-Maverick-17B-128E-Instruct-FP8", 
    "Llama-4-Scout-17B-16E-Instruct", 
    "Qwen2.5-7B-Instruct-Turbo", 
    "Meta-Llama-3.1-8B-Instruct-Turbo"
]

# Process the responses
for record in responses:
    for model in models:
        label_key = f"{model}_Dupe_output_label"
        reason_key = f"{model}_Dupe_reason"
        if label_key in record and reason_key in record:
            output_label = record[label_key]
            if output_label:
                new_key = f"{reason_key}_output_label_{output_label}"
                record[new_key] = record[reason_key]


## Generate Dupes for all combinations (where one is wrong)

In [16]:
nest_asyncio.apply()

async_client = AsyncTogether(api_key=os.environ.get("TOGETHER_API_KEY"))

# Limit number of concurrent API calls
semaphore = asyncio.Semaphore(20)  # <-- Set to 10, 20, or 50 depending on your system and Together API limits

failed_records_id = []
failed_comparisons = failed_records = []

async def process_record_dupe(record, model_name, model_2):
    async with semaphore:  # Throttle concurrent calls
        questions = record.get("questions", "")
        text = record.get("text", "")
        current_label = record.get(model_name + "_output_label")

        if not questions or not text or not current_label:
            return None  # Skip invalid

        # Step 1: Find the first alternate label that differs from GT
        required_label = record.get(model_2 + "_output_label")
        if not required_label:
            return None  # All models agree with GT → skip

        # Step 2: Format prompt
        prompt = ANSWER_SELECTION_DUPE_PROMPT_TEMPLATE.format(
            questions=questions,
            text=text,
            correct_answer=required_label
        )
        exact_model = format_model_name_together(model_name)

        # Step 3: Call the model
        try:
            response = await async_client.chat.completions.create(
                model=exact_model,
                messages=[{"role": "user", "content": prompt}],
                response_format={"type": "json_object"}
            )
            api_response = response.choices[0].message.content
            api_response = extract_braces_content(api_response)

            key_name_reason = model_name + "_Dupe_reason_output_label_" + required_label
            response_json = fix_json_response(api_response)
            record[key_name_reason] = response_json.get("reason")

            return record

        except Exception as e:
            failed_records.append(str(e))
            failed_records_id.append(record.get("pid"))
            return None


In [14]:
async def generate_dupe_all_combinations(evaluator_model, evaluatee_model, records, harmful_subset, repeat_failures=False):
    model1 = evaluator_model
    model2 = evaluatee_model

    tasks = []

    for record in records:
        pid = record.get('pid')
        gt_label = record['output_label']
        model1_label = record.get(model1 + '_output_label')
        model2_label = record.get(model2 + '_output_label')
        if repeat_failures:
            if pid not in failed_records_id:
                continue  # Only retry known failed records
        if model1_label is None or model2_label is None:
            continue
        key_name_reason = model1 + "_Dupe_reason_output_label_" + model2_label
        if key_name_reason in record:
            continue
        # Only compare if model1 is wrong and model2 is right
        if harmful_subset:
            if model1_label and model2_label and model1_label != gt_label and model2_label == gt_label:
                tasks.append(process_record_dupe(record, model1, model2))
        else:
            if model1_label and model2_label and model1_label == gt_label and model2_label != gt_label:
                tasks.append(process_record_dupe(record, model1, model2))
    for future in tqdm_asyncio.as_completed(tasks, total=len(tasks), desc="Evaluating Preferences"):
        await future


#### Harmful

In [17]:
await generate_dupe_all_combinations("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
######
await generate_dupe_all_combinations("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#######
await generate_dupe_all_combinations("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#######
await generate_dupe_all_combinations("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 0it [00:00, ?it/s]
Evaluating Preferences: 100%|██████████| 45/45 [00:06<00:00,  7.34it/s]
Evaluating Preferences: 100%|██████████| 26/26 [00:03<00:00,  8.33it/s]
Evaluating Preferences: 0it [00:00, ?it/s]


In [18]:
await generate_dupe_all_combinations("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#######
await generate_dupe_all_combinations("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#######
await generate_dupe_all_combinations("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#######
await generate_dupe_all_combinations("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []


Evaluating Preferences: 100%|██████████| 154/154 [00:11<00:00, 13.07it/s]
Evaluating Preferences: 100%|██████████| 31/31 [00:02<00:00, 10.82it/s]
Evaluating Preferences: 0it [00:00, ?it/s]
Evaluating Preferences: 100%|██████████| 23/23 [00:03<00:00,  6.96it/s]


In [19]:
await generate_dupe_all_combinations("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#######
await generate_dupe_all_combinations("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#######
await generate_dupe_all_combinations("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#######
await generate_dupe_all_combinations("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
    

Evaluating Preferences: 100%|██████████| 74/74 [00:26<00:00,  2.79it/s]
Evaluating Preferences: 100%|██████████| 28/28 [00:10<00:00,  2.67it/s]
Evaluating Preferences: 100%|██████████| 32/32 [00:13<00:00,  2.39it/s]
Evaluating Preferences: 0it [00:00, ?it/s]


In [20]:
await generate_dupe_all_combinations("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#######
await generate_dupe_all_combinations("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#######
await generate_dupe_all_combinations("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#######
await generate_dupe_all_combinations("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 177/177 [00:15<00:00, 11.40it/s]
Evaluating Preferences: 100%|██████████| 1/1 [00:00<00:00,  1.11it/s]
Evaluating Preferences: 100%|██████████| 39/39 [00:03<00:00, 11.68it/s]
Evaluating Preferences: 100%|██████████| 91/91 [00:08<00:00, 11.06it/s]
Evaluating Preferences: 100%|██████████| 1/1 [00:00<00:00,  1.17it/s]


In [21]:
await generate_dupe_all_combinations("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#######
await generate_dupe_all_combinations("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#######
await generate_dupe_all_combinations("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#######
await generate_dupe_all_combinations("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
    

Evaluating Preferences: 100%|██████████| 176/176 [01:59<00:00,  1.47it/s]
Evaluating Preferences: 100%|██████████| 43/43 [00:19<00:00,  2.19it/s]
Evaluating Preferences: 100%|██████████| 92/92 [00:33<00:00,  2.76it/s]
Evaluating Preferences: 0it [00:00, ?it/s]


#### Beneificail quadrant

In [22]:
await generate_dupe_all_combinations("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
######
await generate_dupe_all_combinations("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
#######
await generate_dupe_all_combinations("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
#######
await generate_dupe_all_combinations("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 51/51 [00:05<00:00,  9.10it/s]
Evaluating Preferences: 100%|██████████| 56/56 [00:06<00:00,  8.24it/s]
Evaluating Preferences: 100%|██████████| 95/95 [00:11<00:00,  8.08it/s]
Evaluating Preferences: 100%|██████████| 1/1 [00:01<00:00,  1.53s/it]


In [23]:
await generate_dupe_all_combinations("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
#######
await generate_dupe_all_combinations("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
#######
await generate_dupe_all_combinations("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
#######
await generate_dupe_all_combinations("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []


Evaluating Preferences: 100%|██████████| 64/64 [00:07<00:00,  9.06it/s]
Evaluating Preferences: 100%|██████████| 139/139 [00:14<00:00,  9.75it/s]
Evaluating Preferences: 0it [00:00, ?it/s]
Evaluating Preferences: 100%|██████████| 71/71 [00:07<00:00,  8.89it/s]


In [24]:
await generate_dupe_all_combinations("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
#######
await generate_dupe_all_combinations("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
#######
await generate_dupe_all_combinations("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
#######
await generate_dupe_all_combinations("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
    

Evaluating Preferences: 100%|██████████| 148/148 [01:06<00:00,  2.24it/s]
Evaluating Preferences: 100%|██████████| 115/115 [00:53<00:00,  2.15it/s]
Evaluating Preferences: 100%|██████████| 112/112 [00:54<00:00,  2.06it/s]
Evaluating Preferences: 0it [00:00, ?it/s]


In [25]:
await generate_dupe_all_combinations("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
#######
await generate_dupe_all_combinations("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
#######
await generate_dupe_all_combinations("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
#######
await generate_dupe_all_combinations("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []

Evaluating Preferences: 100%|██████████| 77/77 [00:06<00:00, 11.98it/s]
Evaluating Preferences: 100%|██████████| 49/49 [00:04<00:00, 10.94it/s]
Evaluating Preferences: 100%|██████████| 28/28 [00:02<00:00,  9.84it/s]
Evaluating Preferences: 100%|██████████| 2/2 [00:01<00:00,  1.94it/s]


In [26]:
await generate_dupe_all_combinations("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#######
await generate_dupe_all_combinations("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#######
await generate_dupe_all_combinations("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#######
await generate_dupe_all_combinations("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_dupe_all_combinations("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
    

Evaluating Preferences: 100%|██████████| 3/3 [00:01<00:00,  2.65it/s]
Evaluating Preferences: 100%|██████████| 1/1 [00:01<00:00,  1.14s/it]
Evaluating Preferences: 100%|██████████| 3/3 [00:01<00:00,  2.13it/s]
Evaluating Preferences: 0it [00:00, ?it/s]


In [27]:
with open(".\quality\quality_responses.json", "w") as f:
    json.dump(responses, f, indent=4)  # indent=4 makes it more readable

## Generate Paraphrasing External with Rules

In [84]:
PARAPHRASER_PROMPT_TEMPLATE = """You are a helpful assistant that helps rewrites sentences. 
Paraphrase the answer to the question below, writing in a manner you would express the answer but without changing the semantics or meaning of the answer. 
You are not allowed to replace words that appear in the list of fixed words, but you can change their ordering. These words help maintain the meaning of the answer.

### System Output Format:
Respond in **JSON format** with:
- `"paraphrased_answer"`: The answer to the question, paraphrased.


### Question:
{question}

### Answer:
{answer}

### Fixed Words:
{fixed_words}

### Expected Response Format:
```
{{
  "paraphrased_answer": "another paraphrased version of the answer",
}}
"""

In [85]:

def extract_braces_content(s):
    match = re.search(r'\{(.*?)\}', s, re.DOTALL)
    return match.group(0) if match else ""

In [86]:
def overlap_non_stop_words(question, response):
    stop_words = {
        "the", "his", "her", "an", "a", "this", "on", "is", "of", "and", "to", "in", "that", "it", 
        "with", "as", "for", "was", "were", "be", "by", "at", "or", "which", "from", "but", "not"
    }

    # Normalize and tokenize input
    question_words = set(word.lower() for word in question.split())
    response_words = [word.lower() for word in response.split()]

    # Return words that are not stop words and exist in question
    return [word for word in response_words if word not in stop_words and word in question_words]


In [111]:
nest_asyncio.apply()

async_client = AsyncTogether(api_key=os.environ.get("TOGETHER_API_KEY"))

# Limit number of concurrent API calls
semaphore = asyncio.Semaphore(20)  # <-- Set to 10, 20, or 50 depending on your system and Together API limits

failed_comparisons = []

async def process_record_paraphrase(record, model_name, model_2, paraphraser='meta-llama/Llama-3.3-70B-Instruct-Turbo'):
    async with semaphore:  # Throttle concurrent calls
        questions = record.get("questions", "")
        text = record.get("text", "")
        current_label = record.get(model_name + "_output_label")
        response = record.get(model_name+"_reason")
        fixed_words = overlap_non_stop_words(questions, response)
        
        if not questions or not text or not current_label:
            return None  # Skip invalid


        prompt = PARAPHRASER_PROMPT_TEMPLATE.format(
            question=questions,
            answer=response,
            fixed_words=fixed_words
        )
        # use the paraphraser model to generate the paraphrased answer
        exact_model = format_model_name_together(paraphraser)

        # Step 3: Call the model
        try:
            response = await async_client.chat.completions.create(
                model=exact_model,
                messages=[{"role": "user", "content": prompt}],
                response_format={"type": "json_object"}
            )
            api_response = response.choices[0].message.content
            # api_response = extract_braces_content(api_response)

            key_name_reason = model_name + "_reason_paraphrased_external" 
            #response_json = fix_json_response(api_response)
            response_json = json.loads(api_response)
            record[key_name_reason] = response_json.get("paraphrased_answer")

            return record

        except Exception as e:
            # print(f"Failed QA comparison call for model {model_name}: {e}")
            failed_comparisons.append(record.get("pid"))
            return None


In [99]:
async def generate_paraphrase_external_combinations(evaluator_model, evaluatee_model, records, harmful_subset, repeat_failures=False):
    model1 = evaluator_model
    model2 = evaluatee_model

    tasks = []

    for record in records:
        pid = record.get('pid')
        gt_label = record['output_label']
        model1_label = record.get(model1 + '_output_label')
        model2_label = record.get(model2 + '_output_label')
        if repeat_failures:
            if pid not in failed_comparisons:
                continue  # Only retry known failed records
        if model1_label is None or model2_label is None:
            continue
        key_name_reason = model1 + "_reason_paraphrased_external" 
        if key_name_reason in record:
            continue
        # Only compare if model1 is wrong and model2 is right
        if harmful_subset:
            if model1_label and model2_label and model1_label != gt_label and model2_label == gt_label:
                tasks.append(process_record_paraphrase(record, model1, model2))
        else:
            if model1_label and model2_label and model1_label == gt_label and model2_label != gt_label:
                tasks.append(process_record_paraphrase(record, model1, model2))
    for future in tqdm_asyncio.as_completed(tasks, total=len(tasks), desc="Generating..."):
        await future


In [100]:
await generate_paraphrase_external_combinations("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#######  
await generate_paraphrase_external_combinations("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#######
await generate_paraphrase_external_combinations("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#######
await generate_paraphrase_external_combinations("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []

Generating...: 100%|██████████| 461/461 [08:28<00:00,  1.10s/it]
Generating...: 100%|██████████| 317/317 [06:33<00:00,  1.24s/it]
Generating...: 100%|██████████| 202/202 [04:55<00:00,  1.46s/it]
Generating...: 100%|██████████| 140/140 [03:57<00:00,  1.70s/it]
Generating...: 100%|██████████| 140/140 [02:45<00:00,  1.18s/it]
Generating...: 100%|██████████| 79/79 [02:35<00:00,  1.97s/it]
Generating...: 100%|██████████| 113/113 [03:47<00:00,  2.01s/it]
Generating...: 100%|██████████| 61/61 [02:30<00:00,  2.47s/it]


In [None]:
await generate_paraphrase_external_combinations("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#######
await generate_paraphrase_external_combinations("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#######
await generate_paraphrase_external_combinations("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#######
await generate_paraphrase_external_combinations("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []


In [112]:
await generate_paraphrase_external_combinations("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#######
await generate_paraphrase_external_combinations("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#######
await generate_paraphrase_external_combinations("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#######
await generate_paraphrase_external_combinations("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []

Generating...: 100%|██████████| 167/167 [04:06<00:00,  1.47s/it]
Generating...: 100%|██████████| 67/67 [02:43<00:00,  2.44s/it]
Generating...: 100%|██████████| 83/83 [03:56<00:00,  2.85s/it]
Generating...: 100%|██████████| 50/50 [01:23<00:00,  1.68s/it]
Generating...: 100%|██████████| 93/93 [02:26<00:00,  1.58s/it]
Generating...: 100%|██████████| 51/51 [01:21<00:00,  1.60s/it]
Generating...: 100%|██████████| 82/82 [01:22<00:00,  1.00s/it]
Generating...: 100%|██████████| 44/44 [00:51<00:00,  1.16s/it]


In [113]:
await generate_paraphrase_external_combinations("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#######
await generate_paraphrase_external_combinations("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#######
await generate_paraphrase_external_combinations("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#######
await generate_paraphrase_external_combinations("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []

Generating...: 100%|██████████| 509/509 [05:24<00:00,  1.57it/s] 
Generating...: 100%|██████████| 286/286 [05:34<00:00,  1.17s/it]
Generating...: 100%|██████████| 173/173 [12:59<00:00,  4.50s/it] 
Generating...: 100%|██████████| 103/103 [04:11<00:00,  2.44s/it]
Generating...: 100%|██████████| 273/273 [07:13<00:00,  1.59s/it]
Generating...: 100%|██████████| 160/160 [02:34<00:00,  1.04it/s]
Generating...: 100%|██████████| 111/111 [03:22<00:00,  1.82s/it]
Generating...: 100%|██████████| 72/72 [03:02<00:00,  2.53s/it]


In [114]:
await generate_paraphrase_external_combinations("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#######
await generate_paraphrase_external_combinations("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#######
await generate_paraphrase_external_combinations("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
#######
await generate_paraphrase_external_combinations("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=True, repeat_failures=True)
    failed_comparisons = []
    

Generating...: 100%|██████████| 575/575 [17:10<00:00,  1.79s/it]
Generating...: 100%|██████████| 347/347 [12:12<00:00,  2.11s/it]
Generating...: 100%|██████████| 266/266 [09:30<00:00,  2.15s/it]
Generating...: 100%|██████████| 157/157 [03:54<00:00,  1.50s/it]
Generating...: 100%|██████████| 329/329 [04:53<00:00,  1.12it/s]
Generating...: 100%|██████████| 209/209 [03:36<00:00,  1.03s/it]
Generating...: 100%|██████████| 134/134 [01:58<00:00,  1.13it/s]
Generating...: 100%|██████████| 93/93 [01:32<00:00,  1.01it/s]


In [117]:
with open(".\quality\quality_responses.json", "w") as f:
    json.dump(responses, f, indent=4)  # indent=4 makes it more readable

### Beneficial

In [165]:
await generate_paraphrase_external_combinations("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("Llama-4-Scout-17B-16E-Instruct", "DeepSeek-V3", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
#######  
await generate_paraphrase_external_combinations("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("Llama-4-Scout-17B-16E-Instruct", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
#######
await generate_paraphrase_external_combinations("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("Llama-4-Scout-17B-16E-Instruct", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
#######
await generate_paraphrase_external_combinations("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("Llama-4-Scout-17B-16E-Instruct", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []

Generating...: 100%|██████████| 140/140 [02:47<00:00,  1.20s/it]
Generating...: 100%|██████████| 55/55 [01:17<00:00,  1.41s/it]
Generating...: 100%|██████████| 130/130 [02:32<00:00,  1.18s/it]
Generating...: 100%|██████████| 49/49 [01:31<00:00,  1.87s/it]
Generating...: 100%|██████████| 297/297 [03:06<00:00,  1.59it/s]
Generating...: 100%|██████████| 122/122 [01:24<00:00,  1.44it/s]
Generating...: 100%|██████████| 246/246 [01:36<00:00,  2.56it/s]
Generating...: 100%|██████████| 105/105 [00:58<00:00,  1.80it/s]


In [166]:
await generate_paraphrase_external_combinations("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("DeepSeek-V3", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
#######
await generate_paraphrase_external_combinations("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("DeepSeek-V3", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
#######
await generate_paraphrase_external_combinations("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("DeepSeek-V3", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
#######
await generate_paraphrase_external_combinations("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("DeepSeek-V3", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []

Generating...: 100%|██████████| 353/353 [02:04<00:00,  2.83it/s]
Generating...: 100%|██████████| 174/174 [01:25<00:00,  2.04it/s]
Generating...: 100%|██████████| 305/305 [02:15<00:00,  2.25it/s]
Generating...: 100%|██████████| 164/164 [01:36<00:00,  1.69it/s]
Generating...: 100%|██████████| 356/356 [02:33<00:00,  2.32it/s]
Generating...: 100%|██████████| 196/196 [01:46<00:00,  1.84it/s]
Generating...: 100%|██████████| 316/316 [02:11<00:00,  2.40it/s]
Generating...: 100%|██████████| 197/197 [02:30<00:00,  1.31it/s]


In [167]:
await generate_paraphrase_external_combinations("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("Llama-4-Maverick-17B-128E-Instruct-FP8", "DeepSeek-V3", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
#######
await generate_paraphrase_external_combinations("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("Llama-4-Maverick-17B-128E-Instruct-FP8", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
#######
await generate_paraphrase_external_combinations("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("Llama-4-Maverick-17B-128E-Instruct-FP8", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
#######
await generate_paraphrase_external_combinations("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("Llama-4-Maverick-17B-128E-Instruct-FP8", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []


Generating...: 100%|██████████| 167/167 [01:09<00:00,  2.39it/s]
Generating...: 100%|██████████| 78/78 [00:58<00:00,  1.32it/s]
Generating...: 100%|██████████| 438/438 [05:36<00:00,  1.30it/s]
Generating...: 100%|██████████| 211/211 [03:55<00:00,  1.12s/it]
Generating...: 100%|██████████| 342/342 [08:19<00:00,  1.46s/it]
Generating...: 100%|██████████| 196/196 [03:07<00:00,  1.05it/s]
Generating...: 100%|██████████| 110/110 [01:35<00:00,  1.15it/s]
Generating...: 100%|██████████| 77/77 [01:21<00:00,  1.06s/it]


In [168]:
await generate_paraphrase_external_combinations("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
#######
await generate_paraphrase_external_combinations("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("Qwen2.5-7B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
#######
await generate_paraphrase_external_combinations("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("Qwen2.5-7B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
#######
await generate_paraphrase_external_combinations("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("Qwen2.5-7B-Instruct-Turbo", "Meta-Llama-3.1-8B-Instruct-Turbo", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []

Generating...: 100%|██████████| 226/226 [01:56<00:00,  1.94it/s]
Generating...: 100%|██████████| 91/91 [00:56<00:00,  1.61it/s]
Generating...: 100%|██████████| 146/146 [01:21<00:00,  1.78it/s]
Generating...: 100%|██████████| 69/69 [00:53<00:00,  1.30it/s]
Generating...: 100%|██████████| 51/51 [00:45<00:00,  1.13it/s]
Generating...: 100%|██████████| 33/33 [00:45<00:00,  1.37s/it]
Generating...: 100%|██████████| 229/229 [02:28<00:00,  1.54it/s]
Generating...: 100%|██████████| 112/112 [01:11<00:00,  1.56it/s]


In [169]:
await generate_paraphrase_external_combinations("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Maverick-17B-128E-Instruct-FP8", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
#######
await generate_paraphrase_external_combinations("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("Meta-Llama-3.1-8B-Instruct-Turbo", "Llama-4-Scout-17B-16E-Instruct", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
#######
await generate_paraphrase_external_combinations("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("Meta-Llama-3.1-8B-Instruct-Turbo", "DeepSeek-V3", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
#######
await generate_paraphrase_external_combinations("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False)
if (len(failed_comparisons) > 0):
    await generate_paraphrase_external_combinations("Meta-Llama-3.1-8B-Instruct-Turbo", "Qwen2.5-7B-Instruct-Turbo", responses, harmful_subset=False, repeat_failures=True)
    failed_comparisons = []
    

Generating...: 100%|██████████| 200/200 [01:59<00:00,  1.67it/s]
Generating...: 100%|██████████| 94/94 [01:36<00:00,  1.03s/it]
Generating...: 100%|██████████| 149/149 [01:26<00:00,  1.72it/s]
Generating...: 100%|██████████| 81/81 [01:16<00:00,  1.05it/s]
Generating...: 100%|██████████| 70/70 [01:06<00:00,  1.05it/s]
Generating...: 100%|██████████| 45/45 [01:07<00:00,  1.51s/it]
Generating...: 100%|██████████| 164/164 [01:38<00:00,  1.67it/s]
Generating...: 100%|██████████| 91/91 [00:51<00:00,  1.78it/s]


In [170]:
with open(".\quality\quality_responses.json", "w") as f:
    json.dump(responses, f, indent=4)  # indent=4 makes it more readable