In [1]:
import re
import asyncio
import os
from dotenv import load_dotenv
from openai import AsyncOpenAI
from openai import OpenAI
from tqdm.asyncio import tqdm_asyncio
import json



MODEL_NAME_1 = "qwen-3b-raw"

MODEL_NAME_2 = "qwen-3b-raft"


JUDGE_NAME = "meta-llama/Llama-3.3-70B-Instruct"






def get_judge_prompt(question, relevant_context, reference_answer, model_answer_1, model_answer_2):
    messages = []

    messages.append({"role": "system", "content": "You are a helpful assistant that two side by side RAG answers against the ground truth. Always respond with a JSON object "
                                                "containing a 'comment' (string), 'winner' (number 0-2) fields."})

    judge_prompt = f"""
    **Question**: {question}

    **Relevant context**: {relevant_context}

    **Reference Answer**: {reference_answer}

    **Model Answer 1**: {model_answer_1}

    **Model Answer 2**: {model_answer_2}

    
    **Evaluation Rules**:
    - Compare Answer A and Answer B side by side against the reference.
    - Decide which is better overall, or state if they are equally good (if that so winner=0).
    - Briefly justify your choice (2–3 sentences)

    Provide analysis in JSON format: {{"comment": "...", "winner": 1-2}}"""

    messages.append({"role": "user", "content": judge_prompt})

    return messages

In [None]:
# import re
# import asyncio
# import os
# from dotenv import load_dotenv
# from openai import AsyncOpenAI
# from openai import OpenAI
# from tqdm.asyncio import tqdm_asyncio
# import json



# MODEL_NAME = "qwen-1.5b-ru"


# JUDGE_NAME = "meta-llama/Llama-3.3-70B-Instruct"





# REJECT_ANSW = "К сожалению, ответа на вопрос нет в упомянутых источниках"

# def get_judge_prompt(question, relevant_context, reference_answer, model_answer):
#     messages = []

#     messages.append({"role": "system", "content": "You are a helpful assistant that scores RAG answers against the ground truth. Always respond with a JSON object "
#                                                 "containing a 'comment' (string), 'is_inappropriate_refusal' (0-1), 'score' (number 1-5), 'is_correct' (0-1) fields."})

#     judge_prompt = f"""
#     **Question**: {question}
#     **Relevant context**: {relevant_context}
#     **Reference Answer**: {reference_answer}
#     **Model Answer**: {model_answer}

#     **Evaluation Rules**:
#     1. Scoring Scale:
#     - 5: Good answer (factually correct, complete)
#     - 4: Mostly correct with minor issues
#     - 3: Partially correct but has inaccuracies
#     - 2: Mostly incorrect but contains relevant elements
#     - 1: Completely wrong or irrelevant

#     2. Binary Correctness:
#     - 1 (Correct): Answer conveys the same meaning as reference (exact wording not required)
#     - 0 (Incorrect): Meaning differs from reference or contains false information

#     3. Binary Refuse:
#     - 1 (Inappropriate refusal): The model answer is a refusal, while the reference contains a full answer
#     - 0 (A normal answer or an appropriate refusal)

#     4. Key Principles:
#     - Focus on whether the model answers the question asked, not on whether it exactly matches the reference
#     - There is no need to lower the score if the model's answer omits some facts that are insignificant to the question
#     - If an answer seems redundant in relation to the reference, but the information is contained in a relevant context, then there is no need to lower the score for this
#     - Ignore stylistic differences if core meaning is preserved
#     - Inappropriate refusal should always have score 2 out of 5!

#     Provide analysis in JSON format: {{"comment": "...", "score": 1-5, "is_correct": 0|1, "is_inappropriate_refusal": 0|1}}"""

#     messages.append({"role": "user", "content": judge_prompt})

#     return messages

In [2]:
load_dotenv()
API_KEY = os.getenv("NEBIUS_API_KEY")

# client = OpenAI(
#     base_url="https://api.studio.nebius.com/v1/",
#     api_key=API_KEY
# )

client = AsyncOpenAI(
    base_url="https://api.studio.nebius.com/v1/",
    api_key=API_KEY
)

In [None]:
# with open("generation/qwen-1.5b-raw.json", "r") as f:
#     benchmark_data = json.load(f)
#     data=benchmark_data[1]

# messages = get_judge_prompt(
#     data['table_data']['Вопрос'],
#     data['table_data']['Отрывок из документа'],
#     data['table_data']['Ответ'],
#     data['model_answer']
# )

# response = client.chat.completions.create(
#     model="deepseek-ai/DeepSeek-V3",
#     messages=messages, 
#     temperature=0.5,
#     max_completion_tokens=512
# )

# print(response.choices[0].message.content)

{"comment": "The model answer does not accurately address the question about the percentage of difficult-to-extract oil in Russia and its locations. Instead, it provides unrelated information about the total reserves and types of oil in different regions, without mentioning the specific percentage of difficult-to-extract oil or its primary locations as stated in the reference answer.", "score": 1, "is_correct": 0, "is_inappropriate_refusal": 0}


In [3]:
async def run_llm(messages : str, model : str, semaphore : asyncio.Semaphore):
    async with semaphore:
        return await client.chat.completions.create(
            model=model,
            messages=messages, 
            temperature=0.5,
            max_completion_tokens=512
        )
    

async def process_task(task_id, messages, model, semaphore):
    try:
        response = await run_llm(
            messages=messages,
            model=model,
            semaphore=semaphore,
        )
        return {
            "task_id": task_id,
            "content": response.choices[0].message.content,
            "status": "success"
        }
    except Exception as e:
        return {
            "task_id": task_id,
            "error": str(e),
            "status": "failed"
        }

In [4]:
semaphore = asyncio.Semaphore(50)

with open(f"generation/{MODEL_NAME_1}.json", "r") as f:
    benchmark_data_1 = json.load(f)

with open(f"generation/{MODEL_NAME_2}.json", "r") as f:
    benchmark_data_2 = json.load(f)    

tasks = [process_task(
    task_id=i,
    messages=get_judge_prompt(benchmark_data_1[i]['table_data']['Вопрос'], benchmark_data_1[i]['table_data']['Отрывок из документа'], 
                              benchmark_data_1[i]['table_data']['Ответ'], benchmark_data_1[i]['model_answer'], benchmark_data_2[i]['model_answer']),
    model=JUDGE_NAME,
    semaphore=semaphore)
    for i in range(len(benchmark_data_1))
]

results = []
with tqdm_asyncio(total=len(tasks), desc="Processing LLM requests") as pbar:
    for coro in asyncio.as_completed(tasks):
        result = await coro
        results.append(result)
        pbar.update(1)

Processing LLM requests: 100%|██████████| 674/674 [00:54<00:00, 12.38it/s]


In [5]:
results.sort(key=lambda x: x['task_id'])

for result in results:
    if result['status'] == "failed":
        print(result['task_id'])

In [8]:
def parse(content):
    json_str = re.search(r'\{.*?\}', content, re.DOTALL)

    result = json.loads(json_str.group(0))

    keys = ['comment', 'winner']

    # keys = ['comment', 'score', 'is_correct', 'is_inappropriate_refusal']

    judgment = {}
    for key in keys:
        if key in result:
            judgment[key] = result[key]
        else:
            judgment[key] = None

    return judgment


for i, data in enumerate(benchmark_data_1):
    try:
        data["judgment"] = parse(results[i]['content'])
    except:
        print(data)

with open(f"scored_sbs/{MODEL_NAME_1}-vs-{MODEL_NAME_2}.json", "w") as f:
    json.dump(benchmark_data_1, f, ensure_ascii=False, indent=2)

{'table_data': {'Домен документов': 'Техническая документация', 'Сет документов': 'linux', 'Название документа': 'linux.pdf', 'Отрывок из документа': 'Таблица 3.2. Подкаталоги корневого каталога\n\nДиректория\tСодержимое\n/bin\tОбщие программы для совместного использования системой, системным администратором и пользователями.\n/boot\tЗагрузочные файлы и ядро, vmlinuz. В некоторых дистрибутивах также данные grub – загрузчика.\n/dev\tСодержит ссылки на все периферийные устройства, представленные файлами с особыми свойствами.\n/etc\tБольшинство важных системных файлов конфигурации, аналогично Панели Управления Windows.\n/home\tДомашние каталоги обычных пользователей.\n/initrd\tИнформация для загрузки (в некоторых дистрибутивах). Не удаляйте!\n/lib\tФайлы библиотек, необходимые системе и пользователям.\n/lost+found\tФайлы, спасенные во время сбоев (присутствует в каждом разделе).\n/misc\tДля разных целей.\n/mnt\tСтандартные точки монтирования для внешних файловых систем.\n/net\tСтандартные

In [None]:
# with open("scored/qwen-1.5b-raw.json", "r") as raw, \
#      open("scored/qwen-1.5b-baseline.json", "r") as baseline:
#     raw = json.load(raw)[0:20]
#     baseline = json.load(baseline)[0:20]

#     avg_raw = 0
#     avg_baseline = 0

#     for i in range(20):
#         print("-----------------------------------------------------------------\n")
#         print(f"Question: {raw[i]['table_data']['Вопрос']}\n")
#         print(f"Relevant context: {raw[i]['table_data']['Отрывок из документа']}\n")
#         print(f"Reference answer: {raw[i]['table_data']['Ответ']}\n")
#         print(f"RAW answer: {raw[i]['model_answer']}\n")
#         print(f"RAW judgment: {raw[i]['judgment']}\n")
#         print(f"BASELINE answer: {baseline[i]['model_answer']}\n")
#         print(f"BASELINE judgment: {baseline[i]['judgment']}\n")
#         print("-----------------------------------------------------------------\n")

#         avg_raw += raw[i]["judgment"]["score"]
#         avg_baseline += baseline[i]["judgment"]["score"]

#     print("\n\n\n")
#     print(f"RAW avg score: {avg_raw / 20}")
#     print(f"BASELINE avg score: {avg_baseline / 20}")

-----------------------------------------------------------------

Question: Какова доля трудноизвлекаемых запасов нефти в России и где расположены основные месторождения?

Relevant context: Характерной особенностью современной мировой нефтедобычи является
увеличение в структуре запасов доли трудноизвлекаемых нефтей (с вязкостью 30мПа и
выше). Во многих промышленно развитых странах мира тяжелая нефть рассматривается
в качестве основной базы развития нефтедобычи на ближайшие годы. Россия также
обладает значительными ресурсами трудноизвлекаемых запасов, их объем составляет
около 55 % от общих запасов российской нефти. Основные российские месторождения
высоковязкой нефти расположены в Пермской области, Татарстане, Башкирии и
Удмуртии. Наиболее крупные из них: Ван-Еганское, Северо-Комсомольское, Усинское,
Русское, Гремихинское и др., при этом более 2/3 всех запасов высоковязкой нефти
находятся на глубинах до 2000 м. На сегодня добыча высоковязкой нефти,
транспортировка ее к пунктам сбора и

In [7]:
with open("scored_simple/qwen-1.5b-baseline.json", "r") as baseline:
    baseline = json.load(baseline)

    avg_baseline = 0

    for i, item in enumerate(baseline):
        # print("-----------------------------------------------------------------\n")
        # print(f"Question: {raw[i]['table_data']['Вопрос']}\n")
        # print(f"Relevant context: {raw[i]['table_data']['Отрывок из документа']}\n")
        # print(f"Reference answer: {raw[i]['table_data']['Ответ']}\n")
        # print(f"RAW answer: {raw[i]['model_answer']}\n")
        # print(f"RAW judgment: {raw[i]['judgment']}\n")
        # print(f"BASELINE answer: {baseline[i]['model_answer']}\n")
        # print(f"BASELINE judgment: {baseline[i]['judgment']}\n")
        # print("-----------------------------------------------------------------\n")

        avg_baseline += item["judgment"]["score"]

    print("\n\n\n")
    print(f"BASELINE avg score: {avg_baseline / len(baseline)}")





BASELINE avg score: 2.8175074183976263


In [8]:
with open("scored_simple/qwen-1.5b-raw.json", "r") as baseline:
    baseline = json.load(baseline)

    avg_baseline = 0

    for i, item in enumerate(baseline):
        # print("-----------------------------------------------------------------\n")
        # print(f"Question: {raw[i]['table_data']['Вопрос']}\n")
        # print(f"Relevant context: {raw[i]['table_data']['Отрывок из документа']}\n")
        # print(f"Reference answer: {raw[i]['table_data']['Ответ']}\n")
        # print(f"RAW answer: {raw[i]['model_answer']}\n")
        # print(f"RAW judgment: {raw[i]['judgment']}\n")
        # print(f"BASELINE answer: {baseline[i]['model_answer']}\n")
        # print(f"BASELINE judgment: {baseline[i]['judgment']}\n")
        # print("-----------------------------------------------------------------\n")

        avg_baseline += item["judgment"]["score"]

    print("\n\n\n")
    print(f"BASELINE avg score: {avg_baseline / len(baseline)}")





BASELINE avg score: 2.986646884272997


In [11]:
with open("scored_sbs/qwen-3b-raw-vs-qwen-3b-raft.json", "r") as f:
    data = json.load(f)

    even = 0
    raw = 0
    baseline = 0


    for i, item in enumerate(data):
        # print("-----------------------------------------------------------------\n")
        # print(f"Question: {raw[i]['table_data']['Вопрос']}\n")
        # print(f"Relevant context: {raw[i]['table_data']['Отрывок из документа']}\n")
        # print(f"Reference answer: {raw[i]['table_data']['Ответ']}\n")
        # print(f"RAW answer: {raw[i]['model_answer']}\n")
        # print(f"RAW judgment: {raw[i]['judgment']}\n")
        # print(f"BASELINE answer: {baseline[i]['model_answer']}\n")
        # print(f"BASELINE judgment: {baseline[i]['judgment']}\n")
        # print("-----------------------------------------------------------------\n")

        try:
            winner = item["judgment"]["winner"]
        except:
            winner = 2
        if winner == 0:
            even+=1
        elif winner == 1:
            raw+=1
        else:
            baseline+=1

    print("\n\n\n")
    print(f"even: {even / len(data)}")
    print(f"raw: {raw / len(data)}")
    print(f"baseline: {baseline / len(data)}")





even: 0.032640949554896145
raw: 0.5504451038575667
baseline: 0.4169139465875371
