# QWEN

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
from transformers import AutoModelForCausalLM
import json
import torch



MODEL_NAME = "qwen-1.5b-raft"



BATCH_SIZE = 16

SYSTEM = {"role": "system", "content": "Ты — экспертная система Compressa RAG. Предоставляющая точные и релевантные ответы на вопросы."}

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct", padding_side='left')

model = AutoModelForCausalLM.from_pretrained(
    "../Semyon/experiments/qwen1_5b-v100-ru-raft/merged_model_500",
    device_map="auto",)
model.eval()


# "../Alex/qwen1_5b-v100-bs_12_2-1epoch-webglm_ft/merged_model",
# "../Semyon/experiments/qwen1_5b-v100-ru-raft/merged_model"


REJECT_ANSW = "К сожалению, ответа на вопрос нет в упомянутых источниках"

def get_summary_prompt(context_list, question):
    context = ''
    for i, c in enumerate(context_list):
        # if i == 5:
        #     break
        context += f'Источник [{i+1}], документ {c["doc"]}:'+"\n"+c['text']+"\n\n"

    prompt = (
        f"# Контекстная информация:\n\n{context}\n\n"
        "---\n"
        "# Инструкции:\n\n"
        "1. Дай полный и краткий ответ на вопрос, используя только информацию из контекста.\n"
        f"2. Если ответа на вопрос нет в источниках, напиши: \"{REJECT_ANSW}\".\n"
        f"# Вопрос:\n\n{question}\n\n"
    )

    return prompt

def generate_conversation(row):
    formatted_message = [SYSTEM] + [
        {"role": "user", "content": get_summary_prompt(row['context'], row['table_data']['Вопрос'])},
    ]
    return formatted_message

2025-05-20 21:28:56.848310: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747765736.872240   87473 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747765736.879711   87473 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747765736.899633   87473 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747765736.899654   87473 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747765736.899657   87473 computation_placer.cc:177] computation placer alr

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
device = model.device

with open("benchmark.json", "r") as f:
    benchmark_data = json.load(f)


for i in tqdm(range(0, len(benchmark_data), BATCH_SIZE)):
    batch = benchmark_data[i:i+BATCH_SIZE]
    prompts = [generate_conversation(row) for row in batch]
    chat_prompts = tokenizer.apply_chat_template(
        prompts,
        tokenize=False,
        add_generation_prompt=True
    )
        
    inputs = tokenizer(
        chat_prompts,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=5120
    ).to(device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            do_sample=False,
            temperature=None,
            top_p=None,
            top_k=None
        )
    
    decoded = tokenizer.batch_decode(
        outputs[:, inputs.input_ids.shape[1]:], 
        skip_special_tokens=True,
    )
    
    for j, item in enumerate(batch):
        # print(batch[j]['table_data']['Вопрос'])
        # print(f"---------------------------------------------------\n{decoded[j]}\n---------------------------------------------------")
        # print(batch[j]['table_data']['Ответ'])
        # print()

        item["model_answer"] = decoded[j]

    # assert(0 == 1)


with open(f"generation/{MODEL_NAME}.json", "w") as f:
    json.dump(benchmark_data, f, ensure_ascii=False, indent=2)

100%|██████████| 43/43 [1:47:02<00:00, 149.36s/it]


# LLAMA

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
from transformers import AutoModelForCausalLM
import json
import torch



MODEL_NAME = "llama-1b-raw"



BATCH_SIZE = 16

SYSTEM = {"role": "system", "content": "Ты — экспертная система Compressa RAG. Предоставляющая точные и релевантные ответы на вопросы."}

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct", padding_side='left')
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B-Instruct",
    device_map="auto",)
model.eval()

# "meta-llama/Llama-3.2-1B-Instruct",
# "../Alex/llama-v100-bs_12_2/merged_model",
# "../Alex/llama-v100-bs_12_2-webglm_ft/merged_model",



REJECT_ANSW = "К сожалению, ответа на вопрос нет в упомянутых источниках"

def get_summary_prompt(context_list, question):
    context = ''
    for i, c in enumerate(context_list):
        # if i == 5:
        #     break
        context += f'Источник [{i+1}], документ {c["doc"]}:'+"\n"+c['text']+"\n\n"

    prompt = (
        f"# Контекстная информация:\n\n{context}\n\n"
        "---\n"
        "# Инструкции:\n\n"
        "1. Дай полный и краткий ответ на вопрос, используя только информацию из контекста.\n"
        f"2. Если ты уверен, что ответа на вопрос нет в источниках, напиши: \"{REJECT_ANSW}\".\n"
        f"# Вопрос:\n\n{question}\n\n"
    )

    return prompt

def generate_conversation(row):
    formatted_message = [SYSTEM] + [
        {"role": "user", "content": get_summary_prompt(row['context'], row['table_data']['Вопрос'])},
    ]
    return formatted_message

2025-05-14 13:38:23.458442: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747219103.482478   13811 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747219103.488781   13811 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747219103.508068   13811 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747219103.508084   13811 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747219103.508087   13811 computation_placer.cc:177] computation placer alr

In [2]:
device = model.device

with open("benchmark.json", "r") as f:
    benchmark_data = json.load(f)


for i in tqdm(range(0, len(benchmark_data), BATCH_SIZE)):
    batch = benchmark_data[i:i+BATCH_SIZE]
    prompts = [generate_conversation(row) for row in batch]
    chat_prompts = tokenizer.apply_chat_template(
        prompts,
        tokenize=False,
        add_generation_prompt=True
    )

    # for chat_prompt in chat_prompts:
    #     print(len(tokenizer.encode(chat_prompt, add_special_tokens=False)))
        
    inputs = tokenizer(
        chat_prompts,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=5120
    ).to(device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=False,
            temperature=None,
            top_p=None,
            top_k=None,
            pad_token_id = tokenizer.eos_token_id
        )
    
    decoded = tokenizer.batch_decode(
        outputs[:, inputs.input_ids.shape[1]:], 
        skip_special_tokens=True,
    )
    
    for j, item in enumerate(batch):
        print(batch[j]['table_data']['Вопрос'])
        print(f"---------------------------------------------------\n{decoded[j]}\n---------------------------------------------------")
        print(batch[j]['table_data']['Ответ'])
        print()

        item["model_answer"] = decoded[j]

    assert(1 == 0)


# with open(f"generation/{MODEL_NAME}.json", "w") as f:
#     json.dump(benchmark_data, f, ensure_ascii=False, indent=2)

  0%|          | 0/43 [01:21<?, ?it/s]

Какова доля трудноизвлекаемых запасов нефти в России и где расположены основные месторождения?
---------------------------------------------------
Доля трудноизвлекаемых запасов нефти в России составляет около 55% от общих запасов российской нефти. Основные российские месторождения высоковязкой нефти расположены в Пермской области, Татарстане, Башкирии и Удмуртии. Наиболее крупные из них: Ван Еганское, Северо Комсомольское, Усинское, —Русское, Гремихинское и др. На сегодня добыча высоковязкой нефти, транспортировка ее к пунктам сбора и подготовке и переработка с целью получения конечных продуктов — одна из актуальных задач нефтедобывающей промышленности в связи с ростом доли трудноизвлекаемых запасов нефти в стране.
---------------------------------------------------
Доля трудноизвлекаемых запасов нефти в России составляет около 55 % от общих запасов российской нефти. Основные месторождения высоковязкой (трудноизвлекаемой) нефти расположены в Пермской области, Татарстане, Башкирии и Уд




AssertionError: 

# Ret-Robust

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
from transformers import AutoModelForCausalLM
import json
import torch

MODEL_NAME = "qwen-1.5b-ret-robust"

BATCH_SIZE = 16

SYSTEM = {"role": "system", "content": "Ты — экспертная система Compressa RAG, предоставляющая точные и релевантные ответы на вопросы, используя только предоставленную контекстную информацию. Отвечай на русском языке."}

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct", padding_side='left')

model = AutoModelForCausalLM.from_pretrained(
    "../Ret-Robust_and_Spring/qwen1_5b-v100-bs_12_3-1_epoch-ret-robust/pretrain_save",
    device_map="auto",)
model.eval()


# "../Alex/qwen1_5b-v100-bs_12_2-1epoch-webglm_ft/merged_model",
# "../Semyon/experiments/qwen1_5b-v100-ru-raft/merged_model"


REJECT_ANSW = "К сожалению, ответа на вопрос нет в упомянутых источниках"

def get_summary_prompt(context_list, question):
    context = ''
    for i, c in enumerate(context_list):
        # if i == 5:
        #     break
        context += f'Источник [{i+1}], документ {c["doc"]}:'+"\n"+c['text']+"\n\n"

    prompt = (
        f"# Контекстная информация:\n\n{context}\n"
        "---\n"
        "# Инструкции:\n"
        "1. Отвечай максимально кратко и точно, используя только предоставленный контекст.\n"
        f"2. Если в контексте нет необходимой информации для ответа, напиши: \"{REJECT_ANSW}\".\n"
        "3. Запрещено использовать любые внешние знания или информацию вне контекста.\n"
        f"# Вопрос:\n{question}\n"
    )

    return prompt

def generate_conversation(row):
    formatted_message = [SYSTEM] + [
        {"role": "user", "content": get_summary_prompt(row['context'], row['table_data']['Вопрос'])},
    ]
    return formatted_message

2025-05-19 14:14:15.042451: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747653255.061877    4343 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747653255.068071    4343 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747653255.083581    4343 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747653255.083597    4343 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747653255.083600    4343 computation_placer.cc:177] computation placer alr

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
device = model.device

with open("benchmark.json", "r") as f:
    benchmark_data = json.load(f)


for i in tqdm(range(0, len(benchmark_data), BATCH_SIZE)):
    batch = benchmark_data[i:i+BATCH_SIZE]
    prompts = [generate_conversation(row) for row in batch]
    chat_prompts = tokenizer.apply_chat_template(
        prompts,
        tokenize=False,
        add_generation_prompt=True
    )
        
    inputs = tokenizer(
        chat_prompts,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=5120
    ).to(device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=False,
            temperature=None,
            top_p=None,
            top_k=None
        )
    
    decoded = tokenizer.batch_decode(
        outputs[:, inputs.input_ids.shape[1]:], 
        skip_special_tokens=True,
    )
    
    for j, item in enumerate(batch):
        # print(batch[j]['table_data']['Вопрос'])
        # print(f"---------------------------------------------------\n{decoded[j]}\n---------------------------------------------------")
        # print(batch[j]['table_data']['Ответ'])
        # print()

        item["model_answer"] = decoded[j]

    # assert(0 == 1)


with open(f"generation/{MODEL_NAME}.json", "w") as f:
    json.dump(benchmark_data, f, ensure_ascii=False, indent=2)

#   Saiga 500

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
from transformers import AutoModelForCausalLM
import json
import torch

MODEL_NAME = "qwen-1.5b-ru-500"

BATCH_SIZE = 16

SYSTEM = {"role": "system", "content": "Ты — экспертная система Compressa RAG, предоставляющая точные и релевантные ответы на вопросы, используя только предоставленную контекстную информацию. Отвечай на русском языке."}

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct", padding_side='left')

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-1.5B-Instruct",
    # "../Alex/qwen1_5b-v100-bs_12_2-1epoch/checkpoint-500",
    device_map="auto",)
model.eval()


# "../Alex/qwen1_5b-v100-bs_12_2-1epoch-webglm_ft/merged_model",
# "../Semyon/experiments/qwen1_5b-v100-ru-raft/merged_model"


REJECT_ANSW = "К сожалению, ответа на вопрос нет в упомянутых источниках"

def get_summary_prompt(context_list, question):
    context = ''
    for i, c in enumerate(context_list):
        # if i == 5:
        #     break
        context += f'Источник [{i+1}], документ {c["doc"]}:'+"\n"+c['text']+"\n\n"

    prompt = (
        f"# Контекстная информация:\n\n{context}\n"
        "---\n"
        "# Инструкции:\n"
        "1. Отвечай максимально кратко и точно, используя только предоставленный контекст.\n"
        f"2. Если в контексте нет необходимой информации для ответа, напиши: \"{REJECT_ANSW}\".\n"
        "3. Запрещено использовать любые внешние знания или информацию вне контекста.\n"
        f"# Вопрос:\n{question}\n"
    )

    return prompt

def generate_conversation(row):
    formatted_message = [SYSTEM] + [
        {"role": "user", "content": get_summary_prompt(row['context'], row['table_data']['Вопрос'])},
    ]
    return formatted_message

2025-05-19 15:29:29.813619: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747657769.833998    4785 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747657769.840359    4785 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747657769.855962    4785 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747657769.855976    4785 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747657769.855978    4785 computation_placer.cc:177] computation placer alr

In [None]:
device = model.device

with open("benchmark.json", "r") as f:
    benchmark_data = json.load(f)


for i in tqdm(range(0, len(benchmark_data), BATCH_SIZE)):
    batch = benchmark_data[i:i+BATCH_SIZE]
    prompts = [generate_conversation(row) for row in batch]
    chat_prompts = tokenizer.apply_chat_template(
        prompts,
        tokenize=False,
        add_generation_prompt=True
    )
        
    inputs = tokenizer(
        chat_prompts,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=5120
    ).to(device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=False,
            temperature=None,
            top_p=None,
            top_k=None
        )
    
    decoded = tokenizer.batch_decode(
        outputs[:, inputs.input_ids.shape[1]:], 
        skip_special_tokens=True,
    )
    
    for j, item in enumerate(batch):
        print(batch[j]['table_data']['Вопрос'])
        print(f"---------------------------------------------------\n{decoded[j]}\n---------------------------------------------------")
        print(batch[j]['table_data']['Ответ'])
        print()

        item["model_answer"] = decoded[j]

    # assert(0 == 1)


with open(f"generation/{MODEL_NAME}.json", "w") as f:
    json.dump(benchmark_data, f, ensure_ascii=False, indent=2)

In [None]:
device = model.device

with open("benchmark.json", "r") as f:
    benchmark_data = json.load(f)


for i in tqdm(range(0, len(benchmark_data), BATCH_SIZE)):
    batch = benchmark_data[i:i+BATCH_SIZE]
    prompts = [generate_conversation(row) for row in batch]
    chat_prompts = tokenizer.apply_chat_template(
        prompts,
        tokenize=False,
        add_generation_prompt=True
    )
        
    inputs = tokenizer(
        chat_prompts,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=5120
    ).to(device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=False,
            temperature=None,
            top_p=None,
            top_k=None
        )
    
    decoded = tokenizer.batch_decode(
        outputs[:, inputs.input_ids.shape[1]:], 
        skip_special_tokens=True,
    )
    
    for j, item in enumerate(batch):
        print(batch[j]['table_data']['Вопрос'])
        print(f"---------------------------------------------------\n{decoded[j]}\n---------------------------------------------------")
        print(batch[j]['table_data']['Ответ'])
        print()

        item["model_answer"] = decoded[j]

    # assert(0 == 1)


with open(f"generation/{MODEL_NAME}.json", "w") as f:
    json.dump(benchmark_data, f, ensure_ascii=False, indent=2)

# qwen 3b 

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
from transformers import AutoModelForCausalLM
import json
import torch



MODEL_NAME = "qwen-3b-webglm"



BATCH_SIZE = 8

SYSTEM = {"role": "system", "content": "Ты — экспертная система Compressa RAG. Предоставляющая точные и релевантные ответы на вопросы."}

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B-Instruct", padding_side='left')

model = AutoModelForCausalLM.from_pretrained(
    "../Alex/qwen_3b-v100-webglm/merged_model",
    device_map="auto",)
model.eval()

# "Qwen/Qwen2.5-3B-Instruct"
# "../Alex/qwen_3b-v100-webglm/merged_model"
# "../Alex/qwen1_5b-v100-bs_12_2-1epoch-webglm_ft/merged_model",
# "../Semyon/experiments/qwen1_5b-v100-ru-raft/merged_model"


REJECT_ANSW = "К сожалению, ответа на вопрос нет в упомянутых источниках"

def get_summary_prompt(context_list, question):
    context = ''
    for i, c in enumerate(context_list):
        # if i == 5:
        #     break
        context += f'Источник [{i+1}], документ {c["doc"]}:'+"\n"+c['text']+"\n\n"

    prompt = (
        f"# Контекстная информация:\n\n{context}\n\n"
        "---\n"
        "# Инструкции:\n\n"
        "1. Дай полный и краткий ответ на вопрос, используя только информацию из контекста.\n"
        f"2. Если ответа на вопрос нет в источниках, напиши: \"{REJECT_ANSW}\".\n"
        f"# Вопрос:\n\n{question}\n\n"
    )

    return prompt

def generate_conversation(row):
    formatted_message = [SYSTEM] + [
        {"role": "user", "content": get_summary_prompt(row['context'], row['table_data']['Вопрос'])},
    ]
    return formatted_message

2025-05-20 13:03:06.513501: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747735386.535055    3456 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747735386.541252    3456 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747735386.557270    3456 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747735386.557285    3456 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747735386.557287    3456 computation_placer.cc:177] computation placer alr

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
device = model.device

with open("benchmark.json", "r") as f:
    benchmark_data = json.load(f)


for i in tqdm(range(0, len(benchmark_data), BATCH_SIZE)):
    batch = benchmark_data[i:i+BATCH_SIZE]
    prompts = [generate_conversation(row) for row in batch]
    chat_prompts = tokenizer.apply_chat_template(
        prompts,
        tokenize=False,
        add_generation_prompt=True
    )
        
    inputs = tokenizer(
        chat_prompts,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=5120
    ).to(device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=False,
            temperature=None,
            top_p=None,
            top_k=None
        )
    
    decoded = tokenizer.batch_decode(
        outputs[:, inputs.input_ids.shape[1]:], 
        skip_special_tokens=True,
    )
    
    for j, item in enumerate(batch):
        # print(batch[j]['table_data']['Вопрос'])
        # print(f"---------------------------------------------------\n{decoded[j]}\n---------------------------------------------------")
        # print(batch[j]['table_data']['Ответ'])
        # print()
        # print()
        # print()

        item["model_answer"] = decoded[j]

    # assert(0 == 1)


with open(f"generation/{MODEL_NAME}.json", "w") as f:
    json.dump(benchmark_data, f, ensure_ascii=False, indent=2)

  0%|          | 0/85 [00:00<?, ?it/s]

100%|██████████| 85/85 [1:26:06<00:00, 60.79s/it]


# qwen 3b raft

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
from transformers import AutoModelForCausalLM
import json
import torch



MODEL_NAME = "qwen-3b-raft"



BATCH_SIZE = 8

SYSTEM = {"role": "system", "content": "Ты — экспертная система Compressa RAG. Предоставляющая точные и релевантные ответы на вопросы."}

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B-Instruct", padding_side='left')

model = AutoModelForCausalLM.from_pretrained(
    "../Semyon/experiments/qwen3b-v100-webglm-raft/merged_model",
    device_map="auto",)
model.eval()

# "Qwen/Qwen2.5-3B-Instruct"
# "../Alex/qwen_3b-v100-webglm/merged_model"
# "../Alex/qwen1_5b-v100-bs_12_2-1epoch-webglm_ft/merged_model",
# "../Semyon/experiments/qwen1_5b-v100-ru-raft/merged_model"


REJECT_ANSW = "К сожалению, ответа на вопрос нет в упомянутых источниках"

def get_summary_prompt(context_list, question):
    context = ''
    for i, c in enumerate(context_list):
        # if i == 5:
        #     break
        context += f'Источник [{i+1}], документ {c["doc"]}:'+"\n"+c['text']+"\n\n"

    prompt = (
        f"# Контекстная информация:\n\n{context}\n\n"
        "---\n"
        "# Инструкции:\n\n"
        "1. Дай полный и краткий ответ на вопрос, используя только информацию из контекста.\n"
        f"2. Если ответа на вопрос нет в источниках, напиши: \"{REJECT_ANSW}\".\n"
        f"# Вопрос:\n\n{question}\n\n"
    )

    return prompt

def generate_conversation(row):
    formatted_message = [SYSTEM] + [
        {"role": "user", "content": get_summary_prompt(row['context'], row['table_data']['Вопрос'])},
    ]
    return formatted_message

2025-05-20 14:59:14.472088: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747742354.494442    6546 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747742354.501566    6546 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747742354.520008    6546 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747742354.520027    6546 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747742354.520029    6546 computation_placer.cc:177] computation placer alr

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
device = model.device

with open("benchmark.json", "r") as f:
    benchmark_data = json.load(f)


for i in tqdm(range(0, len(benchmark_data), BATCH_SIZE)):
    batch = benchmark_data[i:i+BATCH_SIZE]
    prompts = [generate_conversation(row) for row in batch]
    chat_prompts = tokenizer.apply_chat_template(
        prompts,
        tokenize=False,
        add_generation_prompt=True
    )
        
    inputs = tokenizer(
        chat_prompts,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=5120
    ).to(device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            do_sample=False,
            temperature=None,
            top_p=None,
            top_k=None
        )
    
    decoded = tokenizer.batch_decode(
        outputs[:, inputs.input_ids.shape[1]:], 
        skip_special_tokens=True,
    )
    
    for j, item in enumerate(batch):
        # print(batch[j]['table_data']['Вопрос'])
        # print(f"---------------------------------------------------\n{decoded[j]}\n---------------------------------------------------")
        # print(batch[j]['table_data']['Ответ'])
        # print()
        # print()
        # print()

        item["model_answer"] = decoded[j]

    # assert(0 == 1)


with open(f"generation/{MODEL_NAME}.json", "w") as f:
    json.dump(benchmark_data, f, ensure_ascii=False, indent=2)

  0%|          | 0/85 [00:00<?, ?it/s]

100%|██████████| 85/85 [3:09:39<00:00, 133.88s/it]  


# ret-robust 3b

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
from transformers import AutoModelForCausalLM
import json
import torch

MODEL_NAME = "qwen-3b-ret-robust"

BATCH_SIZE = 8

SYSTEM = {"role": "system", "content": "Ты — экспертная система Compressa RAG, предоставляющая точные и релевантные ответы на вопросы, используя только предоставленную контекстную информацию. Отвечай на русском языке."}

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct", padding_side='left')

model = AutoModelForCausalLM.from_pretrained(
    "../Ret-Robust_and_Spring/qwen3b-v100-ret-robust/checkpoint-160",
    device_map="auto",)
model.eval()


# "../Alex/qwen1_5b-v100-bs_12_2-1epoch-webglm_ft/merged_model",
# "../Semyon/experiments/qwen1_5b-v100-ru-raft/merged_model"


REJECT_ANSW = "К сожалению, ответа на вопрос нет в упомянутых источниках"

def get_summary_prompt(context_list, question):
    context = ''
    for i, c in enumerate(context_list):
        # if i == 5:
        #     break
        context += f'Источник [{i+1}], документ {c["doc"]}:'+"\n"+c['text']+"\n\n"

    prompt = (
        f"# Контекстная информация:\n\n{context}\n"
        "---\n"
        "# Инструкции:\n"
        "1. Отвечай максимально кратко и точно, используя только предоставленный контекст.\n"
        f"2. Если в контексте нет необходимой информации для ответа, напиши: \"{REJECT_ANSW}\".\n"
        "3. Запрещено использовать любые внешние знания или информацию вне контекста.\n"
        f"# Вопрос:\n{question}\n"
    )

    return prompt

def generate_conversation(row):
    formatted_message = [SYSTEM] + [
        {"role": "user", "content": get_summary_prompt(row['context'], row['table_data']['Вопрос'])},
    ]
    return formatted_message

2025-05-22 18:20:02.228168: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747927202.251324   40347 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747927202.258905   40347 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747927202.278125   40347 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747927202.278142   40347 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747927202.278145   40347 computation_placer.cc:177] computation placer alr

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
device = model.device

with open("benchmark.json", "r") as f:
    benchmark_data = json.load(f)


for i in tqdm(range(0, len(benchmark_data), BATCH_SIZE)):
    batch = benchmark_data[i:i+BATCH_SIZE]
    prompts = [generate_conversation(row) for row in batch]
    chat_prompts = tokenizer.apply_chat_template(
        prompts,
        tokenize=False,
        add_generation_prompt=True
    )
        
    inputs = tokenizer(
        chat_prompts,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=5120
    ).to(device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=False,
            temperature=None,
            top_p=None,
            top_k=None
        )
    
    decoded = tokenizer.batch_decode(
        outputs[:, inputs.input_ids.shape[1]:], 
        skip_special_tokens=True,
    )
    
    for j, item in enumerate(batch):
        # print(batch[j]['table_data']['Вопрос'])
        # print(f"---------------------------------------------------\n{decoded[j]}\n---------------------------------------------------")
        # print(batch[j]['table_data']['Ответ'])
        # print()

        item["model_answer"] = decoded[j]

    # assert(0 == 1)


with open(f"generation/{MODEL_NAME}.json", "w") as f:
    json.dump(benchmark_data, f, ensure_ascii=False, indent=2)

  0%|          | 0/85 [00:00<?, ?it/s]

100%|██████████| 85/85 [1:29:44<00:00, 63.35s/it]


# qwen 3b raw-raft

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
from transformers import AutoModelForCausalLM
import json
import torch



MODEL_NAME = "qwen-3b-raw-raft"



BATCH_SIZE = 8

SYSTEM = {"role": "system", "content": "Ты — экспертная система Compressa RAG. Предоставляющая точные и релевантные ответы на вопросы."}

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B-Instruct", padding_side='left')

model = AutoModelForCausalLM.from_pretrained(
    "../Semyon/experiments/qwen3b-v100-raft/merged_model",
    device_map="auto",)
model.eval()

# "Qwen/Qwen2.5-3B-Instruct"
# "../Alex/qwen_3b-v100-webglm/merged_model"
# "../Alex/qwen1_5b-v100-bs_12_2-1epoch-webglm_ft/merged_model",
# "../Semyon/experiments/qwen1_5b-v100-ru-raft/merged_model"


REJECT_ANSW = "К сожалению, ответа на вопрос нет в упомянутых источниках"

def get_summary_prompt(context_list, question):
    context = ''
    for i, c in enumerate(context_list):
        # if i == 5:
        #     break
        context += f'Источник [{i+1}], документ {c["doc"]}:'+"\n"+c['text']+"\n\n"

    prompt = (
        f"# Контекстная информация:\n\n{context}\n\n"
        "---\n"
        "# Инструкции:\n\n"
        "1. Дай краткий ответ на вопрос, используя только информацию из контекста.\n"
        f"2. Если ответа на вопрос нет в источниках, напиши: \"{REJECT_ANSW}\".\n"
        f"# Вопрос:\n\n{question}\n\n"
    )

    return prompt

def generate_conversation(row):
    formatted_message = [SYSTEM] + [
        {"role": "user", "content": get_summary_prompt(row['context'], row['table_data']['Вопрос'])},
    ]
    return formatted_message

2025-06-14 19:38:57.920490: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749919137.941867    9352 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749919137.949040    9352 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749919137.966941    9352 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1749919137.966961    9352 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1749919137.966981    9352 computation_placer.cc:177] computation placer alr

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
device = model.device

with open("benchmark.json", "r") as f:
    benchmark_data = json.load(f)


for i in tqdm(range(0, len(benchmark_data), BATCH_SIZE)):
    batch = benchmark_data[i:i+BATCH_SIZE]
    prompts = [generate_conversation(row) for row in batch]
    chat_prompts = tokenizer.apply_chat_template(
        prompts,
        tokenize=False,
        add_generation_prompt=True
    )
        
    inputs = tokenizer(
        chat_prompts,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=5120
    ).to(device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            do_sample=False,
            temperature=None,
            top_p=None,
            top_k=None
        )
    
    decoded = tokenizer.batch_decode(
        outputs[:, inputs.input_ids.shape[1]:], 
        skip_special_tokens=True,
    )
    
    for j, item in enumerate(batch):
        # print(batch[j]['table_data']['Вопрос'])
        # print(f"---------------------------------------------------\n{decoded[j]}\n---------------------------------------------------")
        # print(batch[j]['table_data']['Ответ'])
        # print()
        # print()
        # print()

        item["model_answer"] = decoded[j]

    # assert(0 == 1)


with open(f"generation/{MODEL_NAME}.json", "w") as f:
    json.dump(benchmark_data, f, ensure_ascii=False, indent=2)

100%|██████████| 85/85 [2:41:34<00:00, 114.05s/it]  


# qwen 1.5b webglm raft final

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
from transformers import AutoModelForCausalLM
import json
import torch



MODEL_NAME = "qwen-1.5b-raft-final"



BATCH_SIZE = 8

SYSTEM = {"role": "system", "content": "Ты — экспертная система Compressa RAG. Предоставляющая точные и релевантные ответы на вопросы."}

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct", padding_side='left')

model = AutoModelForCausalLM.from_pretrained(
    "../Semyon/experiments/qwen1_5b-v100-webglm-raft/merged_model",
    device_map="auto",)
model.eval()

# "Qwen/Qwen2.5-3B-Instruct"
# "../Alex/qwen_3b-v100-webglm/merged_model"
# "../Alex/qwen1_5b-v100-bs_12_2-1epoch-webglm_ft/merged_model",
# "../Semyon/experiments/qwen1_5b-v100-ru-raft/merged_model"


REJECT_ANSW = "К сожалению, ответа на вопрос нет в упомянутых источниках"

def get_summary_prompt(context_list, question):
    context = ''
    for i, c in enumerate(context_list):
        # if i == 5:
        #     break
        context += f'Источник [{i+1}], документ {c["doc"]}:'+"\n"+c['text']+"\n\n"

    prompt = (
        f"# Контекстная информация:\n\n{context}\n\n"
        "---\n"
        "# Инструкции:\n\n"
        "1. Дай полный и краткий ответ на вопрос, используя только информацию из контекста.\n"
        f"2. Если ответа на вопрос нет в источниках, напиши: \"{REJECT_ANSW}\".\n"
        f"# Вопрос:\n\n{question}\n\n"
    )

    return prompt

def generate_conversation(row):
    formatted_message = [SYSTEM] + [
        {"role": "user", "content": get_summary_prompt(row['context'], row['table_data']['Вопрос'])},
    ]
    return formatted_message

2025-06-15 22:57:57.486379: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750017477.505123   58051 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750017477.511313   58051 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1750017477.526361   58051 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750017477.526376   58051 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750017477.526378   58051 computation_placer.cc:177] computation placer alr

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
device = model.device

with open("benchmark.json", "r") as f:
    benchmark_data = json.load(f)


for i in tqdm(range(0, len(benchmark_data), BATCH_SIZE)):
    batch = benchmark_data[i:i+BATCH_SIZE]
    prompts = [generate_conversation(row) for row in batch]
    chat_prompts = tokenizer.apply_chat_template(
        prompts,
        tokenize=False,
        add_generation_prompt=True
    )
        
    inputs = tokenizer(
        chat_prompts,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=5120
    ).to(device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            do_sample=False,
            temperature=None,
            top_p=None,
            top_k=None
        )
    
    decoded = tokenizer.batch_decode(
        outputs[:, inputs.input_ids.shape[1]:], 
        skip_special_tokens=True,
    )
    
    for j, item in enumerate(batch):
        # print(batch[j]['table_data']['Вопрос'])
        # print(f"---------------------------------------------------\n{decoded[j]}\n---------------------------------------------------")
        # print(batch[j]['table_data']['Ответ'])
        # print()
        # print()
        # print()

        item["model_answer"] = decoded[j]

    # assert(0 == 1)


with open(f"generation/{MODEL_NAME}.json", "w") as f:
    json.dump(benchmark_data, f, ensure_ascii=False, indent=2)

  0%|          | 0/85 [00:00<?, ?it/s]

100%|██████████| 85/85 [2:01:37<00:00, 85.85s/it]  


# qwen 1.5b webglm final

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
from transformers import AutoModelForCausalLM
import json
import torch



MODEL_NAME = "qwen-1.5b-webglm-final"



BATCH_SIZE = 8

SYSTEM = {"role": "system", "content": "Ты — экспертная система Compressa RAG. Предоставляющая точные и релевантные ответы на вопросы."}

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct", padding_side='left')

model = AutoModelForCausalLM.from_pretrained(
    "../Semyon/experiments/qwen1_5b-v100-webglm-final/merged_model",
    device_map="auto",)
model.eval()

# "Qwen/Qwen2.5-3B-Instruct"
# "../Alex/qwen_3b-v100-webglm/merged_model"
# "../Alex/qwen1_5b-v100-bs_12_2-1epoch-webglm_ft/merged_model",
# "../Semyon/experiments/qwen1_5b-v100-ru-raft/merged_model"


REJECT_ANSW = "К сожалению, ответа на вопрос нет в упомянутых источниках"

def get_summary_prompt(context_list, question):
    context = ''
    for i, c in enumerate(context_list):
        # if i == 5:
        #     break
        context += f'Источник [{i+1}], документ {c["doc"]}:'+"\n"+c['text']+"\n\n"

    prompt = (
        f"# Контекстная информация:\n\n{context}\n\n"
        "---\n"
        "# Инструкции:\n\n"
        "1. Дай полный и краткий ответ на вопрос, используя только информацию из контекста.\n"
        f"2. Если ответа на вопрос нет в источниках, напиши: \"{REJECT_ANSW}\".\n"
        f"# Вопрос:\n\n{question}\n\n"
    )

    return prompt

def generate_conversation(row):
    formatted_message = [SYSTEM] + [
        {"role": "user", "content": get_summary_prompt(row['context'], row['table_data']['Вопрос'])},
    ]
    return formatted_message

2025-06-16 01:03:32.355293: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750025012.374087  115913 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750025012.379911  115913 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1750025012.395482  115913 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750025012.395498  115913 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750025012.395500  115913 computation_placer.cc:177] computation placer alr

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
device = model.device

with open("benchmark.json", "r") as f:
    benchmark_data = json.load(f)


for i in tqdm(range(0, len(benchmark_data), BATCH_SIZE)):
    batch = benchmark_data[i:i+BATCH_SIZE]
    prompts = [generate_conversation(row) for row in batch]
    chat_prompts = tokenizer.apply_chat_template(
        prompts,
        tokenize=False,
        add_generation_prompt=True
    )
        
    inputs = tokenizer(
        chat_prompts,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=5120
    ).to(device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=False,
            temperature=None,
            top_p=None,
            top_k=None
        )
    
    decoded = tokenizer.batch_decode(
        outputs[:, inputs.input_ids.shape[1]:], 
        skip_special_tokens=True,
    )
    
    for j, item in enumerate(batch):
        # print(batch[j]['table_data']['Вопрос'])
        # print(f"---------------------------------------------------\n{decoded[j]}\n---------------------------------------------------")
        # print(batch[j]['table_data']['Ответ'])
        # print()
        # print()
        # print()

        item["model_answer"] = decoded[j]

    # assert(0 == 1)


with open(f"generation/{MODEL_NAME}.json", "w") as f:
    json.dump(benchmark_data, f, ensure_ascii=False, indent=2)

100%|██████████| 85/85 [1:00:40<00:00, 42.83s/it]
