In [1]:
!huggingface-cli login --token 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

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.


Token is valid (permission: fineGrained).
The token `LLaMa` has been saved to /home/user/.cache/huggingface/stored_tokens
Your token has been saved to /home/user/.cache/huggingface/token
Login successful.
The current active token is: `LLaMa`


In [None]:
from collections import defaultdict
import datasets
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
import re
from transformers import AutoModelForCausalLM
from peft import PeftModel


MODEL_NAME = "llama-1b-lora"
MODEL_PATH = "meta-llama/Llama-3.2-1B-Instruct"
# MODEL_PATH = "unsloth/Llama-3.2-1B-Instruct"
BATCH_SIZE = 128

# SYSTEM = [{"role": "system", "content": ""}]

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

BOS_TOKEN = tokenizer.bos_token if tokenizer.bos_token else tokenizer.additional_special_tokens[0] 
print(f"BOS_TOKEN={BOS_TOKEN}")

# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_PATH,  # Путь к объединенной модели
#     device_map="auto",
# )
model = AutoModelForCausalLM.from_pretrained(
    "../../llama-v100-bs_12_2/merged_model",  # Путь к объединенной модели
    device_map="auto",
)
model.eval()




# subjects = ["high_school_biology"]

subjects = ["abstract_algebra", "anatomy", "astronomy", "business_ethics", "clinical_knowledge",
             "college_biology", "college_chemistry", "college_computer_science", "college_mathematics",
               "college_medicine", "college_physics", "computer_security", "conceptual_physics", "econometrics",
                 "electrical_engineering", "elementary_mathematics", "formal_logic", "global_facts",
                   "high_school_biology", "high_school_chemistry", "high_school_computer_science",
                     "high_school_european_history", "high_school_geography", "high_school_government_and_politics",
                       "high_school_macroeconomics", "high_school_mathematics", "high_school_microeconomics",
                         "high_school_physics", "high_school_psychology", "high_school_statistics", "high_school_us_history",
                           "high_school_world_history", "human_aging", "human_sexuality", "international_law",
                             "jurisprudence", "logical_fallacies", "machine_learning", "management", "marketing",
                               "medical_genetics", "miscellaneous", "moral_disputes", "moral_scenarios", "nutrition",
                                 "philosophy", "prehistory", "professional_accounting", "professional_law",
                                   "professional_medicine", "professional_psychology", "public_relations",
                                     "security_studies", "sociology", "us_foreign_policy", "virology", "world_religions"]


all_datasets = {subject: datasets.load_dataset("NLPCoreTeam/mmlu_ru", name=subject, split="test") for subject in subjects}

test_dfs = []
for subject in subjects:
    dataset = all_datasets[subject]
    df = dataset.to_pandas()
    int2str = dataset.features['answer'].int2str
    df['answer'] = df['answer'].map(int2str)
    df.insert(0, 'subject_en', subject)
    test_dfs.append(df)

test_df = pd.concat(test_dfs).reset_index(drop=True)


def create_prompt(row):
    return (
        f"Дан вопрос по теме {row['subject_en']}: {row['question_ru']}. Варианты ответа:\n"
        f"A) {row['choices_ru'][0]}\nB) {row['choices_ru'][1]}\nC) {row['choices_ru'][2]}\nD) {row['choices_ru'][3]}\n"
        "Твой ответ должен быть в формате 'Ответ: <Буква>'.\n"
        "Закончи ответ, указав только одну букву: A, B, C или D.\n"
    )

def generate_conversation(row):
    formatted_message = [
        {"role": "user", "content": create_prompt(row)},
        # {"role": "assistant", "content": "aboba"},
    ]
    return formatted_message

def extract_answer(text):
    text = text.upper().strip()

    explicit_pattern = re.search(
        r"(?:Ответ|ANSWER|Правильный ответ|Answer)[\s:\-—]*([A-D])", 
        text
    )
    if explicit_pattern:
        return explicit_pattern.group(1)

    for char in text:
        if char in {'A','B','C','D'}:
            return char
    
    return ""


def evaluate_test(df, model, tokenizer):
    device = model.device
    df['prediction'] = ''
    
    for i in tqdm(range(0, len(df), BATCH_SIZE), desc="Processing batches"):
        batch = df.iloc[i:i+BATCH_SIZE]
        prompts = [generate_conversation(row) for _, row in batch.iterrows()]
        # print(chat_prompts[0])

        chat_prompts = tokenizer.apply_chat_template(
            prompts,
            tokenize=False,
            add_generation_prompt=True
        )
        
        # print(chat_prompts[0])
        # break
        # chat_prompts = [prompt + f"{tokenizer.convert_ids_to_tokens(128006)}assistant{tokenizer.convert_ids_to_tokens(128007)}\n" for prompt in chat_prompts]
        # print(chat_prompts[0])
        # break
        # print(chat_prompts)
        
        inputs = tokenizer(
            chat_prompts,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=512
        ).to(device)
        
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=30,
                do_sample=False,
                temperature=None,
                top_p=None,
                top_k=None,
            )
        
        decoded = tokenizer.batch_decode(
            outputs[:, inputs.input_ids.shape[1]:], 
            skip_special_tokens=True,
        )
        
        for j, text in enumerate(decoded):
            # print(f"---------------------------------------------------\n{text.strip()}\n---------------------------------------------------")
            answer = extract_answer(text)
            df.at[i+j, 'prediction'] = answer
            
        # break

    df['correct'] = df['answer'] == df['prediction']
    total_acc = df['correct'].mean()
    subject_acc = df.groupby('subject_en')['correct'].mean()


    return total_acc, subject_acc


print("Starting evaluation...")
total_accuracy, subject_accuracy = evaluate_test(test_df, model, tokenizer)


print(f"\n{'='*40}\nTest Results\n{'='*40}")
print(f"Total Accuracy: {total_accuracy:.2%}")
print("\nAccuracy by Subject:")
print(subject_accuracy.sort_values(ascending=False).to_string(float_format="{:,.2%}".format))


test_df.to_csv(f"../result/mmlu_{MODEL_NAME}.csv", index=False)

2025-05-08 23:20:13.678510: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746735613.699570  164565 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746735613.705610  164565 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746735613.721033  164565 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746735613.721050  164565 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746735613.721052  164565 computation_placer.cc:177] computation placer alr

BOS_TOKEN=<|begin_of_text|>
Starting evaluation...


Processing batches:   0%|          | 0/110 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Processing batches:   1%|          | 1/110 [00:08<14:42,  8.10s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Processing batches:   2%|▏         | 2/110 [00:16<14:56,  8.30s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Processing batches:   3%|▎         | 3/110 [00:26<16:03,  9.01s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Processing batches:   4%|▎         | 4/110 [00:35<16:05,  9.11s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Processing batches:   5%|▍         | 5/110 [00:41<13:51,  7.92s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Processing batches:   5%|▌         | 6/110 [00:49<13:50,  7.99s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Processing batches:   6%|▋  


Test Results
Total Accuracy: 32.55%

Accuracy by Subject:
subject_en
us_foreign_policy                     59.00%
international_law                     57.85%
marketing                             49.57%
jurisprudence                         45.37%
sociology                             44.78%
security_studies                      44.08%
philosophy                            43.09%
nutrition                             42.81%
astronomy                             42.76%
human_sexuality                       42.75%
electrical_engineering                42.07%
public_relations                      41.82%
high_school_computer_science          41.00%
business_ethics                       41.00%
high_school_biology                   40.32%
virology                              39.76%
anatomy                               39.26%
clinical_knowledge                    38.11%
computer_security                     38.00%
miscellaneous                         37.68%
human_aging                   