In [1]:
from collections import defaultdict
import datasets
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
import re
from transformers import AutoModelForCausalLM
from peft import PeftModel


MODEL_NAME = "qwen-1.5b-webglm"
MODEL_PATH = "Qwen/Qwen2.5-1.5B-Instruct"
BATCH_SIZE = 128

SYSTEM = [{"role": "system", "content": "Ты — экспертная система Compressa RAG. Предоставляющая точные и релевантные ответы на вопросы."}]

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, padding_side='left')

BOS_TOKEN = tokenizer.bos_token if tokenizer.bos_token else tokenizer.additional_special_tokens[0] 
print(f"BOS_TOKEN={BOS_TOKEN}")

model = AutoModelForCausalLM.from_pretrained(
    "../../qwen1_5b-v100-bs_12_2-1epoch-webglm_ft/merged_model",  # Путь к объединенной модели
    device_map="auto",
)
model.eval()




# subjects = ["high_school_biology"]

subjects = ["abstract_algebra", "anatomy", "astronomy", "business_ethics", "clinical_knowledge",
             "college_biology", "college_chemistry", "college_computer_science", "college_mathematics",
               "college_medicine", "college_physics", "computer_security", "conceptual_physics", "econometrics",
                 "electrical_engineering", "elementary_mathematics", "formal_logic", "global_facts",
                   "high_school_biology", "high_school_chemistry", "high_school_computer_science",
                     "high_school_european_history", "high_school_geography", "high_school_government_and_politics",
                       "high_school_macroeconomics", "high_school_mathematics", "high_school_microeconomics",
                         "high_school_physics", "high_school_psychology", "high_school_statistics", "high_school_us_history",
                           "high_school_world_history", "human_aging", "human_sexuality", "international_law",
                             "jurisprudence", "logical_fallacies", "machine_learning", "management", "marketing",
                               "medical_genetics", "miscellaneous", "moral_disputes", "moral_scenarios", "nutrition",
                                 "philosophy", "prehistory", "professional_accounting", "professional_law",
                                   "professional_medicine", "professional_psychology", "public_relations",
                                     "security_studies", "sociology", "us_foreign_policy", "virology", "world_religions"]


all_datasets = {subject: datasets.load_dataset("NLPCoreTeam/mmlu_ru", name=subject, split="test") for subject in subjects}

test_dfs = []
for subject in subjects:
    dataset = all_datasets[subject]
    df = dataset.to_pandas()
    int2str = dataset.features['answer'].int2str
    df['answer'] = df['answer'].map(int2str)
    df.insert(0, 'subject_en', subject)
    test_dfs.append(df)

test_df = pd.concat(test_dfs).reset_index(drop=True)


def create_prompt(row):
    return (
        f"Дан вопрос по теме {row['subject_en']}: {row['question_ru']}. Варианты ответа:\n"
        f"A) {row['choices_ru'][0]}\nB) {row['choices_ru'][1]}\nC) {row['choices_ru'][2]}\nD) {row['choices_ru'][3]}\n"
        "Твой ответ должен быть в формате 'Ответ: <Буква>'.\n"
        "Закончи ответ, указав только одну букву: A, B, C или D.\n"
    )

def generate_conversation(row):
    formatted_message = [SYSTEM] + [
        {"role": "user", "content": create_prompt(row)},
    ]
    return formatted_message

def extract_answer(text):
    text = text.upper().strip()

    explicit_pattern = re.search(
        r"(?:Ответ|ANSWER|Правильный ответ|Answer)[\s:\-—]*([A-D])", 
        text
    )
    if explicit_pattern:
        return explicit_pattern.group(1)

    for char in text:
        if char in {'A','B','C','D'}:
            return char
    
    return ""


def evaluate_test(df, model, tokenizer):
    device = model.device
    df['prediction'] = ''
    
    for i in tqdm(range(0, len(df), BATCH_SIZE), desc="Processing batches"):
        batch = df.iloc[i:i+BATCH_SIZE]
        prompts = [generate_conversation(row) for _, row in batch.iterrows()]
        chat_prompts = tokenizer.apply_chat_template(
            prompts,
            tokenize=False,
            add_generation_prompt=True
        )
        # chat_prompts = [prompt + f"{BOS_TOKEN}assistant\n" for prompt in chat_prompts]
        
        inputs = tokenizer(
            chat_prompts,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=512
        ).to(device)
        
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=30,
                do_sample=False,
                temperature=None,
                top_p=None,
                top_k=None
            )
        
        decoded = tokenizer.batch_decode(
            outputs[:, inputs.input_ids.shape[1]:], 
            skip_special_tokens=True,
        )
        
        for j, text in enumerate(decoded):
            # print(f"---------------------------------------------------\n{text.strip()}\n---------------------------------------------------")
            answer = extract_answer(text)
            df.at[i+j, 'prediction'] = answer

    df['correct'] = df['answer'] == df['prediction']
    total_acc = df['correct'].mean()
    subject_acc = df.groupby('subject_en')['correct'].mean()


    return total_acc, subject_acc


print("Starting evaluation...")
total_accuracy, subject_accuracy = evaluate_test(test_df, model, tokenizer)


print(f"\n{'='*40}\nTest Results\n{'='*40}")
print(f"Total Accuracy: {total_accuracy:.2%}")
print("\nAccuracy by Subject:")
print(subject_accuracy.sort_values(ascending=False).to_string(float_format="{:,.2%}".format))


test_df.to_csv(f"../result/mmlu_{MODEL_NAME}.csv", index=False)

2025-05-09 18:22:37.369463: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746804157.392635   67996 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746804157.399535   67996 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746804157.417617   67996 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746804157.417636   67996 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746804157.417639   67996 computation_placer.cc:177] computation placer alr

BOS_TOKEN=<|im_start|>


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Starting evaluation...


Processing batches: 100%|██████████| 110/110 [21:19<00:00, 11.63s/it]



Test Results
Total Accuracy: 41.65%

Accuracy by Subject:
subject_en
marketing                             69.23%
us_foreign_policy                     68.00%
international_law                     67.77%
logical_fallacies                     60.74%
sociology                             60.20%
management                            60.19%
jurisprudence                         58.33%
computer_security                     57.00%
medical_genetics                      57.00%
high_school_psychology                56.15%
business_ethics                       56.00%
miscellaneous                         52.49%
clinical_knowledge                    52.45%
astronomy                             51.32%
human_aging                           51.12%
electrical_engineering                51.03%
philosophy                            50.80%
high_school_computer_science          50.00%
college_computer_science              50.00%
public_relations                      48.18%
nutrition                     