In [None]:
import re
from collections import defaultdict
from pathlib import Path

import datasets
import pandas as pd
import torch
from peft import PeftModel
from tqdm import tqdm
from transformers import AutoModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer


# Label for this run; it is interpolated into the results CSV file name.
MODEL_NAME = "qwen-1.5b-raw-eng"
# Identifier passed to `from_pretrained` for both the tokenizer and the model.
MODEL_PATH = "Qwen/Qwen2.5-1.5B-Instruct"
# Number of dataset rows sent through the model per generation call.
BATCH_SIZE = 128

# Optional system messages as a list of chat-message dicts; empty means no
# custom system prompt. The trailing comment keeps a disabled Russian system
# prompt for reference (roughly: "You are the Compressa RAG expert system,
# providing accurate and relevant answers to questions.").
SYSTEM = []#[{"role": "system", "content": "Ты — экспертная система Compressa RAG. Предоставляющая точные и релевантные ответы на вопросы."}]

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Prefer the tokenizer's BOS token; when it has none, fall back to the first
# additional special token.
# NOTE(review): the fallback raises IndexError if the tokenizer defines no
# additional special tokens — confirm for other models.
BOS_TOKEN = tokenizer.bos_token if tokenizer.bos_token else tokenizer.additional_special_tokens[0] 
print(f"BOS_TOKEN={BOS_TOKEN}")

model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,  # Model path or identifier (original note: path to the merged model)
    device_map="auto",
)
# Switch to inference mode (disables dropout and similar train-only behavior).
model.eval()




# subjects = ["high_school_biology"]

# MMLU subject configurations to evaluate, in evaluation order.
subjects = [
    "abstract_algebra",
    "anatomy",
    "astronomy",
    "business_ethics",
    "clinical_knowledge",
    "college_biology",
    "college_chemistry",
    "college_computer_science",
    "college_mathematics",
    "college_medicine",
    "college_physics",
    "computer_security",
    "conceptual_physics",
    "econometrics",
    "electrical_engineering",
    "elementary_mathematics",
    "formal_logic",
    "global_facts",
    "high_school_biology",
    "high_school_chemistry",
    "high_school_computer_science",
    "high_school_european_history",
    "high_school_geography",
    "high_school_government_and_politics",
    "high_school_macroeconomics",
    "high_school_mathematics",
    "high_school_microeconomics",
    "high_school_physics",
    "high_school_psychology",
    "high_school_statistics",
    "high_school_us_history",
    "high_school_world_history",
    "human_aging",
    "human_sexuality",
    "international_law",
    "jurisprudence",
    "logical_fallacies",
    "machine_learning",
    "management",
    "marketing",
    "medical_genetics",
    "miscellaneous",
    "moral_disputes",
    "moral_scenarios",
    "nutrition",
    "philosophy",
    "prehistory",
    "professional_accounting",
    "professional_law",
    "professional_medicine",
    "professional_psychology",
    "public_relations",
    "security_studies",
    "sociology",
    "us_foreign_policy",
    "virology",
    "world_religions",
]


# Load the test split of every subject, keyed by subject name.
all_datasets = {}
for subject in subjects:
    all_datasets[subject] = datasets.load_dataset(
        "NLPCoreTeam/mmlu_ru", name=subject, split="test"
    )

# Convert each split to a DataFrame, turning the integer class label in
# 'answer' into its string name and tagging every row with its subject.
test_dfs = []
for subject, dataset in all_datasets.items():
    df = dataset.to_pandas()
    int2str = dataset.features["answer"].int2str
    df["answer"] = df["answer"].map(int2str)
    df.insert(0, "subject_en", subject)
    test_dfs.append(df)

# One frame for the whole benchmark with a fresh 0..N-1 index.
test_df = pd.concat(test_dfs, ignore_index=True)


def create_prompt(row):
    """Render the English multiple-choice prompt for one dataset row.

    Args:
        row: mapping-like object providing 'subject_en', 'question_en' and
            'choices_en' (an indexable with at least four options).

    Returns:
        The prompt text: topic and question, the four options labelled A-D,
        and the answer-format instructions, ending with a newline.
    """
    options = row['choices_en']
    header = f"The question is on topic {row['subject_en']}: {row['question_en']}. Answer options:"
    # Index explicitly so a row with fewer than four options still raises.
    option_lines = [f"{letter}) {options[idx]}" for idx, letter in enumerate("ABCD")]
    instructions = [
        "Your answer must be in the format 'Answer: <Letter>'.",
        "Complete the answer using only one letter: A, B, C, or D.",
    ]
    return "\n".join([header, *option_lines, *instructions]) + "\n"

def generate_conversation(row):
    """Build the chat message list for one dataset row.

    Args:
        row: dataset row accepted by `create_prompt`.

    Returns:
        A flat list of chat-message dicts: the messages in `SYSTEM` (if any)
        followed by a single user message holding the rendered prompt.
    """
    # SYSTEM is itself a list of message dicts, so it is spliced in rather than
    # wrapped: `[SYSTEM] + [...]` produced a nested list as the first "message".
    return [*SYSTEM, {"role": "user", "content": create_prompt(row)}]

# Explicit answer marker followed by the letter. The alternatives are written in
# upper case because the text is upper-cased before matching; "ОТВЕТ" also
# covers "ПРАВИЛЬНЫЙ ОТВЕТ" since the pattern is searched, not anchored. The
# trailing \b rejects a letter that merely starts a word ("ANSWER: BECAUSE").
_EXPLICIT_ANSWER_RE = re.compile(r"(?:ОТВЕТ|ANSWER)[\s:\-—]*([A-D])\b")

# Fallback: an option letter standing on its own, e.g. "B", "(B)", "<B>", "B)".
_STANDALONE_LETTER_RE = re.compile(r"\b([A-D])\b")


def extract_answer(text):
    """Extract the chosen option letter from generated text.

    The text is upper-cased, then searched for an explicit marker
    ("Answer: X" / "Ответ: X", any letter case). If no marker is found, the
    first option letter that stands alone as a word is used, so letters inside
    ordinary words (the 'A' of "ANSWER", the 'D' of "DON'T") are not mistaken
    for an answer.

    Args:
        text: decoded model output.

    Returns:
        One of 'A', 'B', 'C', 'D', or '' when no answer can be identified.
    """
    text = text.upper().strip()

    explicit = _EXPLICIT_ANSWER_RE.search(text)
    if explicit:
        return explicit.group(1)

    standalone = _STANDALONE_LETTER_RE.search(text)
    if standalone:
        return standalone.group(1)

    return ""


def evaluate_test(df, model, tokenizer):
    """Generate an answer for every row of `df` and score the predictions.

    Rows are processed in batches of `BATCH_SIZE` with greedy decoding. Two
    columns are added to `df` in place: 'prediction' (the letter returned by
    `extract_answer`, '' when none is found) and 'correct' (whether the
    prediction equals the 'answer' column).

    Args:
        df: DataFrame with the columns read by `create_prompt`, plus 'answer'
            and 'subject_en'.
        model: causal language model providing `.device` and `.generate`.
        tokenizer: tokenizer with a chat template matching `model`.

    Returns:
        Tuple `(total_acc, subject_acc)`: the mean of 'correct' over all rows
        and a Series with the mean of 'correct' per 'subject_en'.
    """
    device = model.device
    df['prediction'] = ''
    # Batches are sliced by position, so results are written back by position
    # too; this stays correct even when df does not carry a 0..N-1 index.
    prediction_col = df.columns.get_loc('prediction')

    # Decoder-only models continue from the last token of each row, so padding
    # must be on the left; right padding makes the model generate after pad
    # tokens. The caller's setting is restored afterwards.
    original_padding_side = tokenizer.padding_side
    tokenizer.padding_side = "left"
    try:
        for i in tqdm(range(0, len(df), BATCH_SIZE), desc="Processing batches"):
            batch = df.iloc[i:i+BATCH_SIZE]
            prompts = [generate_conversation(row) for _, row in batch.iterrows()]
            # Let the chat template open the assistant turn instead of
            # hand-building the header from a model-specific token.
            chat_prompts = tokenizer.apply_chat_template(
                prompts,
                tokenize=False,
                add_generation_prompt=True,
            )

            # NOTE(review): prompts longer than max_length are truncated, which
            # may cut off the end of the prompt including the assistant header
            # — confirm 512 tokens is enough for the longest questions.
            inputs = tokenizer(
                chat_prompts,
                padding=True,
                truncation=True,
                return_tensors="pt",
                max_length=512
            ).to(device)

            with torch.no_grad():
                # Sampling parameters are set to None alongside do_sample=False
                # so that only greedy decoding settings are passed.
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=30,
                    do_sample=False,
                    temperature=None,
                    top_p=None,
                    top_k=None
                )

            # Keep only the newly generated tokens (everything after the
            # padded prompt length).
            decoded = tokenizer.batch_decode(
                outputs[:, inputs.input_ids.shape[1]:],
                skip_special_tokens=True,
            )

            answers = [extract_answer(text) for text in decoded]
            df.iloc[i:i + len(answers), prediction_col] = answers
    finally:
        tokenizer.padding_side = original_padding_side

    df['correct'] = df['answer'] == df['prediction']
    total_acc = df['correct'].mean()
    subject_acc = df.groupby('subject_en')['correct'].mean()

    return total_acc, subject_acc


# Run the evaluation over the whole test frame; evaluate_test also adds the
# 'prediction' and 'correct' columns to test_df in place.
print("Starting evaluation...")
total_accuracy, subject_accuracy = evaluate_test(test_df, model, tokenizer)


# Summary: overall accuracy, then per-subject accuracy from best to worst.
print(f"\n{'='*40}\nTest Results\n{'='*40}")
print(f"Total Accuracy: {total_accuracy:.2%}")
print("\nAccuracy by Subject:")
print(subject_accuracy.sort_values(ascending=False).to_string(float_format="{:,.2%}".format))


# Create the output directory first, so a missing folder cannot discard the
# results of a long evaluation run at the very last step.
output_path = Path("../result") / f"mmlu_{MODEL_NAME}.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
test_df.to_csv(output_path, index=False)

2025-05-09 12:57:00.538920: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746784620.628599    3319 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746784620.654000    3319 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746784620.848710    3319 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746784620.848725    3319 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746784620.848728    3319 computation_placer.cc:177] computation placer alr

BOS_TOKEN=<|im_start|>


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Starting evaluation...


Processing batches:   0%|          | 0/110 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Processing batches:   1%|          | 1/110 [00:07<13:46,  7.58s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Processing batches:   2%|▏         | 2/110 [00:16<14:33,  8.08s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Processing batches:   3%|▎         | 3/110 [00:24<14:43,  8.25s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Processing batches:   4%|▎         | 4/110 [00:3


Test Results
Total Accuracy: 51.15%

Accuracy by Subject:
subject_en
marketing                             78.21%
sociology                             76.12%
us_foreign_policy                     76.00%
high_school_psychology                75.05%
logical_fallacies                     74.85%
high_school_geography                 74.24%
computer_security                     74.00%
management                            73.79%
world_religions                       73.10%
international_law                     72.73%
high_school_government_and_politics   70.98%
miscellaneous                         68.33%
jurisprudence                         67.59%
high_school_microeconomics            65.55%
high_school_biology                   64.52%
clinical_knowledge                    64.15%
medical_genetics                      64.00%
business_ethics                       63.00%
prehistory                            62.65%
nutrition                             62.09%
philosophy                    