### Добавим и оценим какую-нибудь "новую" LLM на примере Qwen/Qwen2-7B-Instruct

In [1]:
import os

# `!cd ../` runs in a throwaway subshell, so it never changes the working
# directory of the kernel itself. Change it in-process instead.
# The later cells read and write paths under `examples/`, so step up to the
# parent directory only when the current one has no `examples/` folder; this
# also keeps the cell safe to re-run.
if not os.path.isdir('examples'):
    os.chdir('..')

### Загрузим токенайзер

In [2]:
from transformers import AutoTokenizer

# Load the tokenizer of the model under evaluation; the cells below inspect
# its chat template and special tokens to build a matching conversation config.
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2-7B-Instruct')

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Посмотрим на чат-темплейт (шаблон диалога)

In [3]:
# print() rather than a bare expression, so the newlines embedded in the Jinja
# template are rendered as line breaks instead of escaped "\n" sequences.
print(tokenizer.chat_template)

{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system
You are a helpful assistant.<|im_end|>
' }}{% endif %}{{'<|im_start|>' + message['role'] + '
' + message['content'] + '<|im_end|>' + '
'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
' }}{% endif %}


### Посмотрим на интересующие нас спецтокены

In [4]:
# Special tokens relevant for the conversation config, shown as one tuple:
# every special token, the BOS id, and the EOS token together with its id.
(
    tokenizer.all_special_tokens,
    tokenizer.bos_token_id,
    tokenizer.eos_token,
    tokenizer.eos_token_id,
)

(['<|im_end|>', '<|endoftext|>', '<|im_start|>'], None, '<|im_end|>', 151645)

### Посмотрим на примеры

In [5]:
# Reference rendering #1: an explicit system prompt followed by a finished
# user/assistant exchange, returned as text with no generation prompt.
messages_with_system = [
    {'role': 'system', 'content': 'Ты классный чат-бот.'},
    {'role': 'user', 'content': 'Сколько будет 2+2?'},
    {'role': 'assistant', 'content': '4!'},
]
prompt1 = tokenizer.apply_chat_template(
    messages_with_system,
    tokenize=False,
    add_generation_prompt=False,
)
print(prompt1)

<|im_start|>system
Ты классный чат-бот.<|im_end|>
<|im_start|>user
Сколько будет 2+2?<|im_end|>
<|im_start|>assistant
4!<|im_end|>



In [6]:
# Reference rendering #2: the same finished exchange without a system message,
# so the chat template inserts its own default system prompt.
messages_default_system = [
    {'role': 'user', 'content': 'Сколько будет 2+2?'},
    {'role': 'assistant', 'content': '4!'},
]
prompt2 = tokenizer.apply_chat_template(
    messages_default_system,
    tokenize=False,
    add_generation_prompt=False,
)
print(prompt2)

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Сколько будет 2+2?<|im_end|>
<|im_start|>assistant
4!<|im_end|>



In [7]:
# Reference rendering #3: a lone user message with add_generation_prompt=True,
# so the template ends with the opening of the assistant turn.
messages_awaiting_reply = [
    {'role': 'user', 'content': 'Сколько будет 2+2?'},
]
prompt3 = tokenizer.apply_chat_template(
    messages_awaiting_reply,
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt3)

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Сколько будет 2+2?<|im_end|>
<|im_start|>assistant



### Теперь составим наш JSON-конфиг

In [8]:
# Conversation config mirroring the layout produced by the tokenizer's chat
# template. System, user and bot turns all share the same shape, so the
# template string is written once and reused.
chatml_turn = "<|im_start|>{role}\n{content}<|im_end|>\n"

conv_config = {
    # Default system prompt the chat template injects when the dialog has none.
    "system_prompt": "You are a helpful assistant.",
    "system_message_template": chatml_turn,
    "user_message_template": chatml_turn,
    "bot_message_template": chatml_turn,
    # Same turn without the closing "<|im_end|>\n".
    # NOTE(review): presumably used for a bot turn that is left open — confirm against llmtf.
    "bot_message_template_incomplete": "<|im_start|>{role}\n{content}",
    "user_role": "user",
    "bot_role": "assistant",
    "system_role": "system",
    # As the rendered prompts show, nothing precedes the first turn, so the prefix is empty.
    "global_prefix": "",
    # Added by analogy with add_generation_prompt=True, when the last message is not a bot message.
    "suffix": "<|im_start|>assistant\n",
    # Almost always False.
    "add_special_tokens": False,
    # Main stopping criterion for generation.
    "eos_token": "<|im_end|>",
}


In [9]:
import json
import codecs  # no longer used by this cell; kept in case other cells rely on it

# Persist the conversation config as pretty-printed UTF-8 JSON.
# The built-in open() with an explicit encoding replaces the legacy
# codecs.open(), which is deprecated in recent Python versions.
conv_config_path = 'examples/qwen2_instruct.json'
with open(conv_config_path, 'w', encoding='utf-8') as file:
    # ensure_ascii=False writes non-ASCII characters as-is instead of \uXXXX escapes.
    json.dump(conv_config, file, ensure_ascii=False, indent=4)

In [10]:
from llmtf.conversation import Conversation

# Rebuild the first reference dialog (custom system prompt plus a finished
# user/bot exchange) from the JSON config stored at conv_config_path.
conversation = Conversation.from_template(conv_config_path)
conversation.add_system_message('Ты классный чат-бот.')
conversation.add_user_message('Сколько будет 2+2?')
conversation.add_bot_message('4!')
# add_suffix=False is the counterpart of add_generation_prompt=False used for prompt1.
prompt1_our = conversation.get_prompt(add_suffix=False)

# The llmtf rendering must match the tokenizer's chat template output exactly.
assert prompt1_our == prompt1
print(prompt1_our)

<|im_start|>system
Ты классный чат-бот.<|im_end|>
<|im_start|>user
Сколько будет 2+2?<|im_end|>
<|im_start|>assistant
4!<|im_end|>



In [11]:
# Second reference dialog: no system message is added here, yet the result is
# compared with prompt2, which contains the default system prompt.
conversation = Conversation.from_template(conv_config_path)
conversation.add_user_message('Сколько будет 2+2?')
conversation.add_bot_message('4!')
# add_suffix=False is the counterpart of add_generation_prompt=False used for prompt2.
prompt2_our = conversation.get_prompt(add_suffix=False)

# The llmtf rendering must match the tokenizer's chat template output exactly.
assert prompt2_our == prompt2
print(prompt2_our)

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Сколько будет 2+2?<|im_end|>
<|im_start|>assistant
4!<|im_end|>



In [12]:
# Third reference dialog: a lone user message that still awaits a reply.
conversation = Conversation.from_template(conv_config_path)
conversation.add_user_message('Сколько будет 2+2?')
# add_suffix=True is the counterpart of add_generation_prompt=True used for prompt3.
prompt3_our = conversation.get_prompt(add_suffix=True)

# The llmtf rendering must match the tokenizer's chat template output exactly.
assert prompt3_our == prompt3
print(prompt3_our)

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Сколько будет 2+2?<|im_end|>
<|im_start|>assistant



### Отлично, conversation config готов и корректен (всегда проверяйте!). Теперь посчитаем метрики на каком-нибудь датасете (на части сэмплов)

#### Инициализируем Evaluator

In [13]:
from llmtf.evaluator import Evaluator

# Created with default arguments; used further down to run the benchmark.
evaluator = Evaluator()

#### Загружаем модель с полученным конфигом

In [14]:
from llmtf.model import VLLMModel

model_name_or_path = 'Qwen/Qwen2-7B-Instruct'
# Build the vLLM-backed model wrapper from the conversation config file.
# The device is hard-coded to 'cuda:0'.
# NOTE(review): disable_sliding_window / enable_prefix_caching look like vLLM
# engine options passed through by VLLMModel — confirm against llmtf.model.
model = VLLMModel(conv_config_path, device_map='cuda:0', disable_sliding_window=True, enable_prefix_caching=True)
# Load the checkpoint; the recorded log reports about 14 GB of weights.
model.from_pretrained(model_name_or_path)

INFO 06-10 17:33:28 llm_engine.py:161] Initializing an LLM engine (v0.4.3) with config: model='Qwen/Qwen2-7B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2-7B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda:0, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=Qwen/Qwen2-7B-Instruct)




INFO 06-10 17:33:29 selector.py:51] Using XFormers backend.
INFO 06-10 17:33:31 selector.py:51] Using XFormers backend.
INFO 06-10 17:33:31 weight_utils.py:207] Using model weights format ['*.safetensors']
INFO 06-10 17:33:34 model_runner.py:146] Loading model weights took 14.2487 GB
INFO 06-10 17:33:36 gpu_executor.py:83] # GPU blocks: 60948, # CPU blocks: 4681
INFO 06-10 17:33:39 model_runner.py:854] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 06-10 17:33:39 model_runner.py:858] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 06-10 17:33:48 model_runner.py:924] Graph capturing finished in 8 secs.
INFO 06-10 17:33:48 block_manager_v1.py:2

INFO: 2024-06-10 17:33:48,900: llmtf.base.llm: Override eos_token_id in generation_config from 151645 to 151645
INFO: 2024-06-10 17:33:48,901: llmtf.base.llm: Model id: Qwen/Qwen2-7B-Instruct
INFO: 2024-06-10 17:33:48,923: llmtf.base.llm: global_prefix = 
INFO: 2024-06-10 17:33:48,924: llmtf.base.llm: vllm_adds_bos = False
INFO: 2024-06-10 17:33:48,925: llmtf.base.llm: Resetting generation_config.stop_strings to []
INFO: 2024-06-10 17:33:48,925: llmtf.base.llm: Leading space: False


In [15]:
# Run the evaluation on one dataset and store the results under output_dir.
# The recorded run processed 200 samples in 25 batches, matching
# max_sample_per_dataset=200 and batch_size=8.
# NOTE(review): max_len is presumably a prompt length limit in tokens — confirm against llmtf.
output_dir = 'examples/example_qwen2_7b_instruct_rucola_custom_eval'
datasets_names = ['russiannlp/rucola_custom']
evaluator.evaluate(
    model,
    output_dir,
    datasets_names=datasets_names,
    max_len=4000,
    few_shot_count=5,
    batch_size=8,
    max_sample_per_dataset=200,
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:01<00:00, 175.95it/s]
INFO: 2024-06-10 17:33:52,961: llmtf.base.russiannlp/rucola_custom: Loading Dataset: 4.03s
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:06<00:00,  3.98it/s]
INFO: 2024-06-10 17:33:59,243: llmtf.base.russiannlp/rucola_custom: Processing Dataset: 6.28s
INFO: 2024-06-10 17:33:59,244: llmtf.base.russiannlp/rucola_custom: Results for russiannlp/rucola_custom:
INFO: 2024-06-10 17:33:59,248: llmtf.base.llm: Resetting generation_config.stop_strings to []
INFO: 2024-06-10 17:33:59,249: llmtf.base.russiannlp/rucola_custom: {'acc': 0.725, 'mcc': 0.24077390894881104}
