In [84]:
import re
import asyncio
import os
from dotenv import load_dotenv
from openai import AsyncOpenAI
from openai import OpenAI
from tqdm.asyncio import tqdm_asyncio
import json



# Name of the model whose generated answers are judged; it is interpolated into
# the input ("generation_short...") and output ("scored_short...") JSON file names.
MODEL_NAME = "qwen-3b-raft"

# Suffix appended to the "generation_short" / "scored_short" directory names.
# An empty string means the directories are used without a suffix.
shuffle = ""

# Last component of the JSON file name ("<MODEL_NAME>_<type>.json").
# NOTE(review): this name shadows the builtin `type` for the rest of the notebook.
type = "large"



# Model identifier passed as `model` for every judge request.
JUDGE_NAME = "meta-llama/Llama-3.3-70B-Instruct"





# Russian phrase meaning "Unfortunately, the answer to the question is not in the
# mentioned sources".
# NOTE(review): not referenced in the visible cells; presumably the refusal text the
# evaluated model is expected to produce - confirm before relying on it.
REJECT_ANSW = "К сожалению, ответа на вопрос нет в упомянутых источниках"

def get_judge_prompt(question, relevant_context, reference_answer, model_answer):
    """Build the chat messages that ask the judge model to grade one RAG answer.

    Args:
        question: The question that was posed to the evaluated model.
        relevant_context: The document passage that contains the answer.
        reference_answer: The ground-truth answer.
        model_answer: The answer produced by the evaluated model.

    Returns:
        A list with two chat messages (dicts with "role" and "content"): a system
        message describing the expected JSON reply, followed by a user message
        holding the four inputs and the evaluation rules.
    """
    system_text = (
        "You are a helpful assistant that scores RAG answers against the ground truth. "
        "Always respond with a JSON object containing a 'comment' (string), "
        "'is_inappropriate_refusal' (0-1), 'score' (number 1-5), 'is_correct' (0-1) fields."
    )

    # The rubric is sent verbatim; doubled braces on the last line are f-string
    # escapes that render as a literal JSON example.
    user_text = f"""
    **Question**: {question}
    **Relevant context**: {relevant_context}
    **Reference Answer**: {reference_answer}
    **Model Answer**: {model_answer}

    **Evaluation Rules**:
    1. Scoring Scale:
    - 5: Good answer (factually correct, complete)
    - 4: Mostly correct with minor issues
    - 3: Partially correct but has inaccuracies
    - 2: Mostly incorrect but contains relevant elements
    - 1: Completely wrong or irrelevant

    2. Binary Correctness:
    - 1 (Correct): Answer conveys the same meaning as reference (exact wording not required)
    - 0 (Incorrect): Meaning differs from reference or contains false information

    3. Binary Refuse:
    - 1 (Inappropriate refusal): The model answer is a refusal, while the reference contains a full answer
    - 0 (A normal answer or an appropriate refusal)

    4. Key Principles:
    - Focus on whether the model answers the question asked, not on whether it exactly matches the reference
    - There is no need to lower the score if the model's answer omits some facts that are insignificant to the question
    - If an answer seems redundant in relation to the reference, but the information is contained in a relevant context, then there is no need to lower the score for this
    - Ignore stylistic differences if core meaning is preserved
    - Inappropriate refusal should always have score 2 out of 5!

    Provide analysis in JSON format: {{"comment": "...", "is_inappropriate_refusal": 0|1, "score": 1-5, "is_correct": 0|1}}"""

    return [
        {"role": "system", "content": system_text},
        {"role": "user", "content": user_text},
    ]

In [85]:
# Load variables from a local .env file (if present) into the process environment
# so the API key does not have to be hardcoded in the notebook.
load_dotenv()
# os.getenv returns None when NEBIUS_API_KEY is not set.
API_KEY = os.getenv("NEBIUS_API_KEY")

# Synchronous client variant, currently disabled:
# client = OpenAI(
#     base_url="https://api.studio.nebius.com/v1/",
#     api_key=API_KEY
# )

# Async OpenAI-compatible client pointed at the Nebius endpoint; run_llm uses this
# module-level `client` for every judge request.
client = AsyncOpenAI(
    base_url="https://api.studio.nebius.com/v1/",
    api_key=API_KEY
)

In [None]:
# with open("generation/qwen-1.5b-raw.json", "r") as f:
#     benchmark_data = json.load(f)
#     data=benchmark_data[1]

# messages = get_judge_prompt(
#     data['table_data']['Вопрос'],
#     data['table_data']['Отрывок из документа'],
#     data['table_data']['Ответ'],
#     data['model_answer']
# )

# response = client.chat.completions.create(
#     model="deepseek-ai/DeepSeek-V3",
#     messages=messages, 
#     temperature=0.5,
#     max_completion_tokens=512
# )

# print(response.choices[0].message.content)

{"comment": "The model answer does not accurately address the question about the percentage of difficult-to-extract oil in Russia and its locations. Instead, it provides unrelated information about the total reserves and types of oil in different regions, without mentioning the specific percentage of difficult-to-extract oil or its primary locations as stated in the reference answer.", "score": 1, "is_correct": 0, "is_inappropriate_refusal": 0}


In [86]:
async def run_llm(
    messages: list,
    model: str,
    semaphore: asyncio.Semaphore,
    temperature: float = 0.5,
    max_completion_tokens: int = 512,
):
    """Send one chat-completion request through the module-level async `client`.

    Args:
        messages: Chat messages (list of {"role": ..., "content": ...} dicts),
            e.g. the output of get_judge_prompt.
        model: Identifier of the model to query.
        semaphore: Shared semaphore; the request is only issued while it is held,
            which caps the number of requests in flight.
        temperature: Sampling temperature forwarded to the API (default 0.5).
        max_completion_tokens: Upper bound on generated tokens (default 512).

    Returns:
        The response object returned by `client.chat.completions.create`.
    """
    async with semaphore:
        return await client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            max_completion_tokens=max_completion_tokens,
        )
    

async def process_task(task_id, messages, model, semaphore):
    """Run one judge request and wrap the outcome in a status record.

    Args:
        task_id: Identifier echoed back in the record so results can be re-ordered.
        messages: Chat messages forwarded to run_llm.
        model: Model identifier forwarded to run_llm.
        semaphore: Concurrency limiter forwarded to run_llm.

    Returns:
        On success: {"task_id", "content", "status": "success"}, where "content" is
        the text of the first choice. On any exception:
        {"task_id", "error", "status": "failed"} with the stringified exception.
    """
    outcome = {"task_id": task_id}
    try:
        completion = await run_llm(messages=messages, model=model, semaphore=semaphore)
        outcome["content"] = completion.choices[0].message.content
        outcome["status"] = "success"
    except Exception as exc:
        # Errors are recorded instead of raised so one bad request does not
        # abort the whole batch.
        outcome["error"] = str(exc)
        outcome["status"] = "failed"
    return outcome

In [87]:
semaphore = asyncio.Semaphore(50)

with open(f"generation_short{shuffle}/{MODEL_NAME}_{type}.json", "r") as f:
    benchmark_data = json.load(f)

tasks = [process_task(
    task_id=i,
    messages=get_judge_prompt(data['table_data']['Вопрос'], data['table_data']['Отрывок из документа'], data['table_data']['Ответ'], data['model_answer']),
    model=JUDGE_NAME,
    semaphore=semaphore)
    for i, data in enumerate(benchmark_data)
]

results = []
with tqdm_asyncio(total=len(tasks), desc="Processing LLM requests") as pbar:
    for coro in asyncio.as_completed(tasks):
        result = await coro
        results.append(result)
        pbar.update(1)

Processing LLM requests: 100%|██████████| 231/231 [00:25<00:00,  9.08it/s]


In [88]:
# Results arrive in completion order; put them back into task order in place.
results.sort(key=lambda entry: entry['task_id'])

# Print the ids of the requests that ended with an error.
for result in results:
    if result['status'] != "failed":
        continue
    print(result['task_id'])

In [89]:
def parse(content):
    """Extract the judge's verdict from the raw text of its reply.

    The reply may wrap the JSON object in extra prose or code fences, and the
    'comment' string itself may contain braces, so the text is scanned for the
    first position where a complete JSON object can be decoded rather than being
    cut at the first closing brace.

    Args:
        content: Raw text returned by the judge model.

    Returns:
        A dict with the keys 'comment', 'is_inappropriate_refusal', 'score' and
        'is_correct'; a key that is absent from the judge's JSON maps to None.

    Raises:
        ValueError: If no JSON object can be decoded from `content`.
    """
    keys = ['comment', 'is_inappropriate_refusal', 'score', 'is_correct']

    decoder = json.JSONDecoder()
    result = None
    start = content.find('{')
    while start != -1:
        try:
            # raw_decode parses one JSON value at the start of the string and
            # ignores whatever follows it (closing fences, extra prose).
            candidate, _ = decoder.raw_decode(content[start:])
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            result = candidate
            break
        # Not a decodable object here; try the next opening brace.
        start = content.find('{', start + 1)

    if result is None:
        raise ValueError(f"No JSON object found in judge response: {content!r}")

    return {key: result.get(key) for key in keys}


# Look results up by task_id instead of list position, so this cell stays correct
# even if `results` is still in completion order (sort cell not run).
results_by_id = {result['task_id']: result for result in results}

# Validate before touching benchmark_data so a failed request cannot leave the
# items half-annotated or surface as an opaque KeyError on 'content'.
unusable_ids = [
    i for i in range(len(benchmark_data))
    if results_by_id.get(i, {}).get('status') != "success"
]
if unusable_ids:
    raise RuntimeError(
        f"No successful judge response for task ids {unusable_ids}; "
        "re-run these requests before scoring."
    )

# Attach the parsed verdict to every benchmark item.
for i, data in enumerate(benchmark_data):
    data["judgment"] = parse(results_by_id[i]['content'])

# Persist answers together with their judgments; ensure_ascii=False keeps the
# Cyrillic text human-readable in the output file.
with open(f"scored_short{shuffle}/{MODEL_NAME}_{type}.json", "w") as f:
    json.dump(benchmark_data, f, ensure_ascii=False, indent=2)

In [91]:
# Load the scored answers written by the scoring cell.
with open(f"scored_short{shuffle}/{MODEL_NAME}_{type}.json", "r") as scored_file:
    baseline = json.load(scored_file)

# Running totals over all items (variable names kept from the earlier version of
# this cell; they hold sums, the averages are computed when printing).
avg_baseline = sum(item["judgment"]["score"] for item in baseline)
avg_accuracy = sum(item["judgment"]["is_correct"] for item in baseline)

# Divide by the number of items actually loaded rather than a hard-coded count,
# so the averages stay correct for files of any size.
n_items = len(baseline)

print("\n\n\n")
if n_items:
    print(f"BASELINE avg score: {avg_baseline / n_items}")
    print(f"BASELINE avg accuracy: {avg_accuracy / n_items}")
else:
    print("BASELINE: no scored items found")





BASELINE avg score: 4.125541125541125
BASELINE avg accuracy: 0.7532467532467533


In [None]:
# Sanity check: print how many records this scored file contains.
with open("scored_random/qwen-3b-raw.json", "r") as baseline:
    scored_items = json.load(baseline)
    print(len(scored_items))

96


In [104]:
# Load a previously scored run for comparison.
with open("scored_short_random/qwen-3b-raw_small.json", "r") as scored_file:
    baseline = json.load(scored_file)

# Running totals over all items (variable names kept from the earlier version of
# this cell; they hold sums, the averages are computed when printing).
avg_baseline = sum(item["judgment"]["score"] for item in baseline)
avg_accuracy = sum(item["judgment"]["is_correct"] for item in baseline)

# Divide by the number of items actually loaded rather than a hard-coded count,
# so the averages stay correct for files of any size.
n_items = len(baseline)

print("\n\n\n")
if n_items:
    print(f"BASELINE avg score: {avg_baseline / n_items}")
    print(f"BASELINE avg accuracy: {avg_accuracy / n_items}")
else:
    print("BASELINE: no scored items found")





BASELINE avg score: 4.034632034632034
BASELINE avg accuracy: 0.6883116883116883
