In [1]:
import torch
import warnings
import numpy as np
import pandas as pd
import os
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM
from utils.training import TrainingModel
from utils.preprocess_dataset_for_train import preprocess_dataset

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides library warnings that may point at real problems
# (e.g. pandas copy/assignment warnings) — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
# --- Model selection ---------------------------------------------------------
# Alternative configuration (commented out):
#   model_name = 'facebook/opt-125m'
#   path_to_save_outputs = 'facebook_opt-125m_2'
#   run_name = 'facebook_opt-125m_1'
model_name = 'Qwen/Qwen2-1.5B-Instruct'
type_tuning = 'lora'

# --- Output locations and run label ------------------------------------------
path_to_save_outputs = 'Qwen_Qwen2_1.5B_Instruct'
path_save_model = f'train_model/{path_to_save_outputs}'
run_name = 'Qwen_Qwen2_1.5B_Instruct_lora'

# --- Data handling -----------------------------------------------------------
seed = 42          # seeds NumPy before the evaluation rows are sampled
q = 0.98           # quantile reported in the token-length statistics
max_length = 400   # passed to preprocessing; TODO: check against the maximum token count

# Global settings

In [3]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [4]:
# Load the tokenizer that belongs to `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use the end-of-sequence token as the padding token as well.
# NOTE(review): if padding positions are excluded from the loss downstream, EOS
# positions may be excluded too — confirm against preprocess_dataset / TrainingModel.
tokenizer.pad_token = tokenizer.eos_token

In [5]:
# Load the causal-LM weights in float16 onto the selected device.
# NOTE(review): `model` is not referenced by any later cell visible here
# (TrainingModel receives `model_name`, not this object), so this copy may only be
# occupying GPU memory during training — confirm and drop the cell if it is unused.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=str(device),
    torch_dtype=torch.float16
)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [6]:
# Report the memory reported by torch.cuda.memory_allocated() and the device's
# total memory, both converted from bytes to GiB.
gib = 2**30  # bytes per GiB
allocated_gib = torch.cuda.memory_allocated() / gib
print("GPU memory allocated: %fGB" % allocated_gib)
total_gib = torch.cuda.get_device_properties('cuda').total_memory / gib
print("Max GPU memory: %fGB" % total_gib)

GPU memory allocated: 2.876247GB
Max GPU memory: 11.993530GB


# Dataset

In [7]:
# Load the question/answer table; the first spreadsheet column becomes the index.
df = pd.read_excel('data/df_with_new_question.xlsx', index_col=0)
# Keep only the two columns used for training. Take an explicit copy: a column is
# added to `dataset_excel` later, and assigning into a plain selection of `df`
# makes pandas emit SettingWithCopyWarning (hidden here by the global warning filter).
dataset_excel = df[['new_question', 'description']].copy()

In [8]:
def _count_space_separated_pieces(text):
    """Return how many pieces ``text`` yields when split on single spaces."""
    return len(text.split(' '))


# Store the space-separated piece count of every description in a new column.
dataset_excel['len_description'] = dataset_excel['description'].map(_count_space_separated_pieces)

In [9]:
# dataset_excel = dataset_excel[dataset_excel.len_description <= 250]

In [10]:
# Draw 5% of the row positions, without repetition, for the evaluation split.
# The global NumPy generator is seeded first so the draw is repeatable.
np.random.seed(seed)
n_rows = len(dataset_excel)
eval_index = np.random.choice(n_rows, size=int(n_rows * 0.05), replace=False)

In [11]:
# `eval_index` holds row *positions* drawn from range(len(dataset_excel)), so the
# split is made positionally with a boolean mask. Matching the positions against
# index labels via `.index.isin(...)` is only correct while the index is exactly
# 0..n-1; once rows are filtered out or the spreadsheet index carries other labels
# it silently selects the wrong (and possibly fewer) evaluation rows.
is_eval_row = np.zeros(len(dataset_excel), dtype=bool)
is_eval_row[eval_index] = True

train = dataset_excel[~is_eval_row]
test = dataset_excel[is_eval_row]
len(train), len(test)

(6428, 338)

In [12]:
# Wrap both pandas splits in a DatasetDict; each frame is first converted to a
# {column: list of values} mapping.
dataset = DatasetDict({
    split_name: Dataset.from_dict(frame.to_dict('list'))
    for split_name, frame in (('train', train), ('test', test))
})

# Show the first training record.
dataset['train'][0]

{'new_question': 'Что является критерием для выполнения мер защиты населения при большом аварийном выбросе на АС?',
 'description': 'Большой аварийный выброс - выброс радиоактивных веществ в окружающую среду при аварии на АС, при котором необходимо выполнение мер защиты населения на границе зоны планирования защитных мероприятий на начальном периоде аварии, установленной в соответствии с требованиями норм и правил по размещению АС, и за ее пределами.',
 'len_description': 46}

In [13]:
# Length of tokenizer.encode(...) for every description in the training split.
token_lengths = [len(tokenizer.encode(answer)) for answer in train['description']]

# Summary statistics: mean, maximum and the q-quantile of those lengths.
avg_token_length = np.mean(token_lengths)
max_token_length = np.max(token_lengths)
quant_token_length = np.quantile(token_lengths, q)

print(
    f"Среднее количество токенов: {avg_token_length}",
    f"Максимальное количество токенов: {max_token_length}",
    f"Квантиль {q} количество токенов: {quant_token_length}",
    sep='\n',
)

Среднее количество токенов: 125.64779091474797
Максимальное количество токенов: 1710
Квантиль 0.98 количество токенов: 486.9200000000001


In [14]:
# Keyword arguments forwarded to preprocess_dataset on every batch.
preprocess_kwargs = dict(
    tokenizer=tokenizer,
    max_length=max_length,
    promts='new_question',   # keyword name spelled as preprocess_dataset is called here
    answers='description',
)

# Run preprocess_dataset over the dataset in batches, then expose the listed
# columns as torch tensors.
dataset = dataset.map(preprocess_dataset, batched=True, fn_kwargs=preprocess_kwargs)
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/6428 [00:00<?, ? examples/s]

Map:   0%|          | 0/338 [00:00<?, ? examples/s]

In [15]:
# Display the train split summary (feature names and row count) after preprocessing.
dataset['train']

Dataset({
    features: ['new_question', 'description', 'len_description', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 6428
})

# Train model

In [16]:
def load_latest_checkpoint(checkpoints_dir):
    """Return the name of the most recent checkpoint entry in ``checkpoints_dir``.

    Every entry whose name contains ``'checkpoint'`` is considered. Entries are
    ranked by the training step encoded in the name (``checkpoint-<step>``),
    compared numerically; the modification time only breaks ties and orders
    entries without a step number (those rank below numbered ones). Ranking by
    step instead of modification time alone keeps the result correct when
    timestamps are not in training order, e.g. after the output directory was
    copied or restored.

    Args:
        checkpoints_dir: Directory that holds the checkpoint entries.

    Returns:
        The bare entry name (not a full path), e.g. ``'checkpoint-16000'``.

    Raises:
        FileNotFoundError: If the directory does not exist or holds no entry
            whose name contains ``'checkpoint'``.
    """
    def _rank(name):
        # Sort key: (step parsed from 'checkpoint-<step>' or -1, modification time).
        prefix, _, suffix = name.rpartition('-')
        has_step = prefix.endswith('checkpoint') and suffix.isdecimal()
        step = int(suffix) if has_step else -1
        return step, os.path.getmtime(os.path.join(checkpoints_dir, name))

    checkpoint_names = [name for name in os.listdir(checkpoints_dir) if 'checkpoint' in name]
    if not checkpoint_names:
        # Fail with an explicit message instead of an IndexError on an empty list.
        raise FileNotFoundError(f"No checkpoint entries found in {checkpoints_dir!r}")

    # The highest-ranked entry is last after an ascending sort.
    return sorted(checkpoint_names, key=_rank)[-1]

# Build the path of the checkpoint that load_latest_checkpoint selects inside
# this run's output directory.
checkpoints_dir = f'outputs/{path_to_save_outputs}'
latest_checkpoint_name = load_latest_checkpoint(checkpoints_dir)
checkpoint_path = f'{checkpoints_dir}/{latest_checkpoint_name}'
checkpoint_path

'outputs/Qwen_Qwen2_1.5B_Instruct/checkpoint-16000'

In [17]:
# def clear_cuda_memory():
#     # Освобождаем кэш PyTorch
#     torch.cuda.empty_cache()

#     # Вызываем сборщик мусора
#     import gc
#     gc.collect()

# # Пример использования
# model = ...  # Ваша модель
# optimizer = ...  # Ваш оптимизатор

# # После завершения работы с моделью и оптимизатором
# clear_cuda_memory()

In [18]:
# Configure the project's fine-tuning wrapper for the selected model and tuning type.
# continue_from_checkpoint=True together with `checkpoint_path` presumably makes
# training resume from that checkpoint (the saved log below starts after step 16000).
# NOTE(review): the exact meaning of each argument is defined in
# utils.training.TrainingModel, which is not visible here — confirm there.
model_ft = TrainingModel(
    model_name=model_name,
    type_tuning=type_tuning,
    run_name=run_name,
    output_dir=f'outputs/{path_to_save_outputs}',
    lr=3e-4,
    epochs=20,
    save_steps=500,
    lora_r = 8,
    lora_dropout = 0.1,
    device=str(device),
    report_flag=True,
    # Disabled alternative: start without a checkpoint.
    # continue_from_checkpoint=False,
    continue_from_checkpoint=True,
    # checkpoint_path=None,
    checkpoint_path=checkpoint_path,
)

# Disabled: evaluation on the test split before training.
# print('Метрики до обучения:')
# print(model_ft.eval_model(dataset['test']))
print('Обучение...')
# Train on the train split and ask the wrapper to save the model to `path_save_model`.
# The return value is printed; in the saved output it is a transformers Trainer repr.
print(model_ft.train_model(dataset['train'], save_model=True, path_save_model=path_save_model))
# Disabled: evaluation on the test split after training.
# print('Метрики после обучения:')
# print(model_ft.eval_model(dataset['test'], path_save_model=path_save_model))

cuda


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Обучение...
trainable params: 1,089,536 || all params: 1,544,803,840 || trainable%: 0.0705


git root error: Cmd('git') failed due to: exit code(128)
  cmdline: git rev-parse --show-toplevel
  stderr: 'fatal: detected dubious ownership in repository at 'D:/study/NLP/chat_bot_aep'
'D:/study/NLP/chat_bot_aep/.git' is owned by:
	BUILTIN/�������������� (S-1-5-32-544)
but the current user is:
	Go_S_A/Sergey (S-1-5-21-1745163764-2905533956-3809126793-1001)
To add an exception for this directory, call:

	git config --global --add safe.directory D:/study/NLP/chat_bot_aep'
wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: gorbatovs1997 (gorbatovs-banking) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
16100,0.8062
16200,0.7983
16300,0.7846
16400,0.8019
16500,0.8006
16600,0.7858
16700,0.7916
16800,0.8188
16900,0.8127
17000,0.8205


Модель сохранена в train_model/Qwen_Qwen2_1.5B_Instruct
<transformers.trainer.Trainer object at 0x00000139D913D6A0>
