In [1]:
!nvidia-smi

Tue May 28 09:38:46 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   68C    P8              12W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# Импорт библиотек

In [2]:
import torch
from transformers import AutoModelForQuestionAnswering
from transformers import AutoTokenizer
from datasets import load_dataset
from transformers import DefaultDataCollator
from transformers import TrainingArguments, Trainer
from transformers import pipeline
import copy
import string
import numpy as np
from mealpy import FloatVar, DE

In [3]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
DEVICE

'cuda'

# Подготовка

## Инициализация модели

In [5]:
tokenizer = AutoTokenizer.from_pretrained('ai-forever/ruBert-base')

config.json:   0%|          | 0.00/590 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

In [6]:
model = AutoModelForQuestionAnswering.from_pretrained('ai-forever/ruBert-base').to(DEVICE)

pytorch_model.bin:   0%|          | 0.00/716M [00:00<?, ?B/s]

## Загрузка набора данных SberQuAD

In [7]:
sberquad = load_dataset('sberquad')
sberquad

Downloading builder script:   0%|          | 0.00/4.22k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/4.96k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.84M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.93M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.73M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/45328 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5036 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/23936 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 45328
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 5036
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 23936
    })
})

In [9]:
# Функция для обработки набора данных
def preprocess_function(examples):
    # Тексты вопросов
    questions = [q.strip() for q in examples['question']]
    # Тексты контекстов
    contexts = [c.strip() for c in examples['context']]

    # Входы для модели
    inputs = tokenizer(
        questions,
        contexts,
        max_length=256,
        truncation='only_second',
        padding='max_length',
        return_offsets_mapping=True,
        return_tensors='pt',
    )

    offset_mapping = inputs.pop('offset_mapping')
    answers = examples['answers']
    start_positions = []
    end_positions = []

    # Поиск стартовых и конечных позиций ответов
    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer['answer_start'][0]
        end_char = answer['answer_start'][0] + len(answer['text'][0])
        sequence_ids = inputs.sequence_ids(i)

        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # Если ответ не полностью внутри контекста, отметить начало и конец за (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs['start_positions'] = start_positions
    inputs['end_positions'] = end_positions

    return inputs

In [10]:
# Размеры мини-выборок из SberQuAD для алгоритма дифференциальной эволюции
mini_train_size = 1000
mini_valid_size = 500

In [11]:
# Перемешанный датасет
shuffled = sberquad.shuffle(seed=2209)

# Мини-выборка для оптимизации гиперпараметров
mini_sberquad_train = shuffled['train'].select(range(mini_train_size))
mini_sberquad_valid = shuffled['validation'].select(range(mini_valid_size))

In [12]:
# Преобразование набора обучающих данных в эмбеддинги
mini_sberquad_train = mini_sberquad_train.map(preprocess_function,
                                              batched=True,
                                              remove_columns=mini_sberquad_train.column_names)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [13]:
# Извлечение правильных ответов из валидационной выборки
validation_answers = [answer['text'][0]
                      for answer in mini_sberquad_valid['answers']]

## Метрики для модели

In [14]:
# Нормализация текста: убрать лишние пробелы и знаки пунктуации
def normalize_text(s):
    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        for exclude in string.punctuation:
            text = text.replace(exclude, ' ')
        return text

    return white_space_fix(remove_punc(s.lower()))

# Метрика F1 для текста
def compute_f1(prediction, truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    common_tokens = set(pred_tokens) & set(truth_tokens)

    if len(common_tokens) == 0:
        return 0

    precision = len(common_tokens) / len(pred_tokens)
    recall = len(common_tokens) / len(truth_tokens)

    return 2 * (precision * recall) / (precision + recall)

## Фитнес-функция для DE

In [15]:
def main_fitness(x):
    # Интерпретация входа в виде гиперпараметров
    b1, b2, lr, wd = x

    # Создание копии модели
    agent_model = copy.deepcopy(model)

    # Конфигурация процесса обучения
    training_args = TrainingArguments(
        output_dir='/tmp',
        save_strategy='no',
        per_device_train_batch_size=16,
        num_train_epochs=1,
        learning_rate=lr,
        weight_decay=wd,
        adam_beta1=b1,
        adam_beta2=b2,
        seed=2209,
        data_seed=2209,
        disable_tqdm=True
    )

    # Инициализация объекта для обучения модели
    trainer = Trainer(
        model=agent_model,
        args=training_args,
        train_dataset=mini_sberquad_train,
        tokenizer=tokenizer,
        data_collator=DefaultDataCollator(),
    )

    # Запуск процесса обучения
    trainer.train()

    # Инициализация интерфейса для вопросно-ответной модели
    qa_pipeline = pipeline('question-answering',
                           model=agent_model,
                           tokenizer=tokenizer)

    # Ответы модели на валидационном датасете
    predicted_answers = qa_pipeline(question=mini_sberquad_valid['question'],
                                    context=mini_sberquad_valid['context'])

    # Расчет F1-меры для всех объектов валидационной выборки
    f1_results = [compute_f1(pred['answer'], truth)
                  for pred, truth in zip(predicted_answers,
                                         validation_answers)]

    # Средняя оценка F1 по всей валидационной выборке
    mean_f1 = np.mean(f1_results)
    print(' > f1:', mean_f1, f'{(b1, b2, lr, wd)}')

    return mean_f1

# Дифференциальная эволюция

In [16]:
%%time

# Диапазоны подбора значений
bounds = [
    (0.6, 0.9999), # beta 1
    (0.8, 0.9999), # beta 2
    (1e-6, 1e-2),  # learning rate
    (0, 1e-1),     # weight decay
]

# Описание задачи оптимизации
problem_dict = {
    'bounds': FloatVar(lb=[bound[0] for bound in bounds],
                       ub=[bound[1] for bound in bounds]),
    'minmax': 'max',
    'obj_func': main_fitness
}

# Инициализация алгоритма DE
optimizer = DE.OriginalDE(epoch=4, pop_size=20,
                          wf=0.5, cr=0.7,
                          strategy=1,
                          seed=2209)

# Запуск алгоритма
g_best = optimizer.solve(problem_dict)
print('Solution:', g_best.solution)
print('Fitness:', g_best.target.fitness)

{'train_runtime': 104.8789, 'train_samples_per_second': 9.535, 'train_steps_per_second': 0.601, 'train_loss': 6.275709364149305, 'epoch': 1.0}


INFO:mealpy.evolutionary_based.DE.OriginalDE:Solving single objective optimization problem.


 > f1: 0.059465483209600864 (0.9638203315824687, 0.8742710057963523, 0.008931552823505962, 0.0021755370536997386)
{'train_runtime': 104.2727, 'train_samples_per_second': 9.59, 'train_steps_per_second': 0.604, 'train_loss': 6.248460557725695, 'epoch': 1.0}
 > f1: 0.052273983532807065 (0.926354082787163, 0.8885127042835124, 0.006316421553765208, 0.08756622387324167)
{'train_runtime': 104.7103, 'train_samples_per_second': 9.55, 'train_steps_per_second': 0.602, 'train_loss': 6.259676009889633, 'epoch': 1.0}
 > f1: 0.056651187340351426 (0.8128740067738456, 0.9557988402385269, 0.008015630846131758, 0.031196286481969416)
{'train_runtime': 104.242, 'train_samples_per_second': 9.593, 'train_steps_per_second': 0.604, 'train_loss': 6.257745167565724, 'epoch': 1.0}
 > f1: 0.059781213087775203 (0.8469139894323805, 0.8993003290467253, 0.008254092229627367, 0.006310777471731544)
{'train_runtime': 104.5871, 'train_samples_per_second': 9.561, 'train_steps_per_second': 0.602, 'train_loss': 6.25654965355

INFO:mealpy.evolutionary_based.DE.OriginalDE:>>>Problem: P, Epoch: 1, Current best: 0.5541026891409244, Global best: 0.5541026891409244, Runtime: 2268.85452 seconds


 > f1: 0.06978370662705027 (0.9535614038531713, 0.8988041678248213, 0.003306525741123264, 0.06349488776054135)
{'train_runtime': 105.1329, 'train_samples_per_second': 9.512, 'train_steps_per_second': 0.599, 'train_loss': 6.14208742171999, 'epoch': 1.0}
 > f1: 0.08206347644443619 (0.9506092127637353, 0.9999, 1e-06, 0.004230567379155958)
{'train_runtime': 104.6699, 'train_samples_per_second': 9.554, 'train_steps_per_second': 0.602, 'train_loss': 3.4324137369791665, 'epoch': 1.0}
 > f1: 0.5692116687749815 (0.9849116163825746, 0.899416923524234, 0.00016779708516160956, 0.031196286481969416)
{'train_runtime': 105.0801, 'train_samples_per_second': 9.517, 'train_steps_per_second': 0.6, 'train_loss': 4.920389811197917, 'epoch': 1.0}
 > f1: 0.159429206981529 (0.9999, 0.9999, 2.2911735133159124e-05, 0.03461203647504771)
{'train_runtime': 105.0291, 'train_samples_per_second': 9.521, 'train_steps_per_second': 0.6, 'train_loss': 6.141455271887401, 'epoch': 1.0}
 > f1: 0.0816634764444362 (0.9999, 0.

INFO:mealpy.evolutionary_based.DE.OriginalDE:>>>Problem: P, Epoch: 2, Current best: 0.6364740886564416, Global best: 0.6364740886564416, Runtime: 2271.38098 seconds


 > f1: 0.6178667945401691 (0.9535614038531713, 0.8988041678248213, 0.00016036599563817546, 0.04586810660876382)
{'train_runtime': 105.0481, 'train_samples_per_second': 9.519, 'train_steps_per_second': 0.6, 'train_loss': 6.142616393074157, 'epoch': 1.0}
 > f1: 0.08271961679531338 (0.7180008086101621, 0.8, 1e-06, 0.004230567379155958)
{'train_runtime': 105.0665, 'train_samples_per_second': 9.518, 'train_steps_per_second': 0.6, 'train_loss': 6.141673738994296, 'epoch': 1.0}
 > f1: 0.08206347644443619 (0.9319333808299219, 0.8736972734908447, 1e-06, 0.04415276908122782)
{'train_runtime': 104.9263, 'train_samples_per_second': 9.53, 'train_steps_per_second': 0.6, 'train_loss': 3.2631763276599703, 'epoch': 1.0}
 > f1: 0.6176674567262803 (0.874948602927081, 0.8809309732767485, 0.0004622214872447199, 0.03461203647504771)
{'train_runtime': 104.8637, 'train_samples_per_second': 9.536, 'train_steps_per_second': 0.601, 'train_loss': 3.2504420204768105, 'epoch': 1.0}
 > f1: 0.6101291392503938 (0.9012

INFO:mealpy.evolutionary_based.DE.OriginalDE:>>>Problem: P, Epoch: 3, Current best: 0.6577179747325258, Global best: 0.6577179747325258, Runtime: 2273.89300 seconds


 > f1: 0.6446827137430853 (0.845209117331658, 0.8690671066511764, 0.00023379988552164703, 0.04586810660876382)
{'train_runtime': 104.8447, 'train_samples_per_second': 9.538, 'train_steps_per_second': 0.601, 'train_loss': 6.142793201264881, 'epoch': 1.0}
 > f1: 0.08329104536674195 (0.7180008086101621, 0.8922418321554915, 1e-06, 0.004230567379155958)
{'train_runtime': 104.8771, 'train_samples_per_second': 9.535, 'train_steps_per_second': 0.601, 'train_loss': 3.296231224423363, 'epoch': 1.0}
 > f1: 0.6414382057399084 (0.8794532292474996, 0.9345409371449168, 0.00012623076349143147, 0.031196286481969416)
{'train_runtime': 104.9033, 'train_samples_per_second': 9.533, 'train_steps_per_second': 0.601, 'train_loss': 3.292572990296379, 'epoch': 1.0}
 > f1: 0.6379613236952246 (0.874948602927081, 0.8864182117375966, 0.00014587865135754738, 0.04803579366895503)
{'train_runtime': 104.8971, 'train_samples_per_second': 9.533, 'train_steps_per_second': 0.601, 'train_loss': 3.196628873310392, 'epoch': 1

INFO:mealpy.evolutionary_based.DE.OriginalDE:>>>Problem: P, Epoch: 4, Current best: 0.6614949757500842, Global best: 0.6614949757500842, Runtime: 2270.57097 seconds


 > f1: 0.6522898873744075 (0.8427758133131947, 0.8690671066511764, 0.00032257621266158684, 0.04586810660876382)
Solution: [8.20547739e-01 8.69569141e-01 3.27645561e-04 1.00000000e-01]
Fitness: 0.6614949757500842
CPU times: user 3h 9min 20s, sys: 51.3 s, total: 3h 10min 11s
Wall time: 3h 11min 1s


In [17]:
# Вывод найденных значений гиперпараметров
[print(f'{name:>13} = {val}')
 for name, val in zip(('beta_1', 'beta_2', 'learning_rate', 'weight_decay'),
                      g_best.solution)];

       beta_1 = 0.8205477391624002
       beta_2 = 0.8695691410837043
learning_rate = 0.00032764556111556053
 weight_decay = 0.1
