In [None]:
# Install dependencies (no versions are pinned)
!pip install clearml
!pip install evaluate -q
!pip install nltk -q

In [None]:
# Импортируем необходимые библиотеки
import math
import random

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from datasets import load_dataset
from evaluate import load
from tqdm import tqdm
from torch import nn
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
from transformers.integrations import ClearMLCallback

from clearml import Task

In [None]:
# Configure the ClearML environment: server endpoints and API credentials.
# NOTE(review): the access/secret keys are blank placeholders here; presumably real values
# must be supplied (ideally from a secret store rather than typed into the notebook)
# before the task can authenticate — confirm.
%env CLEARML_WEB_HOST=https://app.clear.ml/
%env CLEARML_API_HOST=https://api.clear.ml
%env CLEARML_FILES_HOST=https://files.clear.ml
%env CLEARML_API_ACCESS_KEY=''
%env CLEARML_API_SECRET_KEY=''

In [None]:
# Initialise the ClearML experiment for this run.
# The type annotation sits on the variable that is actually assigned (the previous bare
# `task:Task` line annotated a name that never received a value).
task_lora_pt: Task = Task.init(
    project_name="PEFT",
    task_name="LoRA_PT_10epoch",
    task_type="training",
)

# Logger handle of the task (not used by the other cells shown in this notebook)
logger = task_lora_pt.get_logger()

In [None]:
# Log the main experiment hyperparameters.
# The values mirror what the notebook actually runs with: the Seq2SeqTrainingArguments cell
# (eval batch size 1, gradient checkpointing enabled) and the prompt_size / lora_rank cell.
task_lora_pt.connect({
    "model": "ruT5-large",
    "method": "LoRA + Prompt Tuning",
    "lora_rank": 8,
    "prompt_size": 20,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 1,
    "gradient_accumulation_steps": 4,
    "gradient_checkpointing": True,
    "learning_rate": 3e-4,
    "weight_decay": 0.01,
    "num_train_epochs": 10,
    "fp16": True,
    "max_input_length": 100,
    "max_output_length": 200,
})

In [None]:
# Fix the random seeds so that experiments are reproducible
def fix_seeds(seed: int) -> None:
    """Seed the NumPy, Python and PyTorch (CPU and current CUDA device) generators.

    Args:
        seed: Value passed to every seeding function.
    """
    seeders = (
        np.random.seed,
        random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for seeder in seeders:
        seeder(seed)

fix_seeds(0)

In [None]:
# Select the device used for training.
# NOTE(review): 'cuda' is hard-coded; presumably a GPU runtime is assumed — confirm.
device = torch.device('cuda')
device

In [None]:
# Load the train and test splits from tab-separated files
data_files = {'train': '/kaggle/input/ru_instruct_gpt4_train.tsv', 'test': '/kaggle/input/ru_instruct_gpt4_test.tsv'}
dataset = load_dataset('csv', data_files=data_files, sep='\t')

In [None]:
# Show the first training example
dataset['train'][0]

In [None]:
# Load the tokenizer and the base ruT5-large model (the model is placed via device_map='cuda')
tokenizer = AutoTokenizer.from_pretrained("ai-forever/ruT5-large")
model = AutoModelForSeq2SeqLM.from_pretrained("ai-forever/ruT5-large", device_map='cuda')

In [None]:
# Preprocessing: merge instruction and context into one prompt, then tokenize prompts and targets
def preprocess(samples: dict, max_input_length: int = 100, max_output_length: int = 200) -> dict:
    """Tokenize a batch of instruction examples for seq2seq training.

    The prompt is assembled the same way as in ``prepare_input_texts`` (used at inference):
    ``instruction + "\\n\\n" + input`` when a context is present, otherwise the instruction
    alone. Missing values are treated as empty strings instead of being formatted as the
    literal text "None"/"nan".

    Args:
        samples: Batch in column format with 'instruction', 'input' and 'output' lists.
        max_input_length: Token limit applied to the encoded prompt.
        max_output_length: Token limit applied to the encoded target.

    Returns:
        Dict with 'input_ids' (encoded prompts) and 'labels' (encoded targets).
    """
    def as_text(value) -> str:
        # Empty fields may come through as None/NaN; map them to an empty string
        return '' if pd.isna(value) else str(value)

    input_texts = []
    for instr, inp in zip(samples['instruction'], samples['input']):
        instruction, context = as_text(instr), as_text(inp)
        # Only append the separator when there is a context to attach
        input_texts.append(f'{instruction}\n\n{context}' if context else instruction)

    input_tokenized = tokenizer(
        input_texts,
        max_length=max_input_length,
        truncation=True,
        return_token_type_ids=False
    )['input_ids']

    output_tokenized = tokenizer(
        [as_text(target) for target in samples['output']],
        max_length=max_output_length,
        truncation=True,
        return_token_type_ids=False
    )['input_ids']

    return {
        'input_ids': input_tokenized,
        'labels': output_tokenized
    }

In [None]:
# Run preprocessing over the dataset and drop the unused 'full_output' column
tokenized_dataset = dataset.map(preprocess, batched=True, remove_columns=['full_output'])

In [None]:
# Load the text-generation quality metrics
bleu_metric = load("bleu")
rouge_metric = load("rouge")
meteor_metric = load("meteor")


def _whitespace_tokenize(text: str) -> list:
    """Lower-case and split on whitespace; passed to ROUGE so non-Latin text is kept."""
    return text.lower().split()


# Compute BLEU, ROUGE and METEOR to assess generation quality
def compute_metrics(eval_pred):
    """Decode predictions and labels and score them with BLEU, ROUGE and METEOR.

    Args:
        eval_pred: Pair ``(predictions, labels)``. Predictions may be token ids, logits with a
            trailing vocabulary axis, or a tuple whose first item is one of those.

    Returns:
        Dict with 'bleu', 'rouge1', 'rouge2', 'rougeL' and 'meteor' scores.
    """
    predictions, labels = eval_pred

    # A tuple carries extra outputs after the predictions themselves
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)

    # A 3-D array is treated as logits and reduced to token ids
    if predictions.ndim == 3:
        predictions = np.argmax(predictions, axis=-1)

    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
    vocab_size = len(tokenizer)

    def to_token_ids(array):
        # -100 marks ignored/padded positions: map it to the pad token so it is dropped on
        # decoding, then clip as a final guard against ids outside the vocabulary.
        array = np.asarray(array)
        array = np.where(array != -100, array, pad_id)
        return np.clip(array.astype(np.int32), 0, vocab_size - 1)

    decoded_preds = tokenizer.batch_decode(to_token_ids(predictions), skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(to_token_ids(labels), skip_special_tokens=True)
    decoded_labels = [[label] for label in decoded_labels]

    try:
        bleu_score = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)['bleu']
    except ZeroDivisionError:
        # Presumably raised when every prediction is empty; report zero instead of aborting
        bleu_score = 0.0

    # The default ROUGE tokenizer keeps only Latin letters and digits, which would discard
    # Cyrillic text entirely; use a whitespace tokenizer instead.
    rouge_result = rouge_metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        tokenizer=_whitespace_tokenize,
    )
    meteor_result = meteor_metric.compute(predictions=decoded_preds, references=decoded_labels)

    return {
        'bleu': bleu_score,
        'rouge1': rouge_result['rouge1'],
        'rouge2': rouge_result['rouge2'],
        'rougeL': rouge_result['rougeL'],
        'meteor': meteor_result['meteor']
    }

In [None]:
# Baseline generation before fine-tuning, kept for comparison

lm_text='Придумайте название для книги на основе следующего описания сюжета.\n\nМолодой парень находит портал в параллельный мир, где его жизнь намного хуже, чем она была до этого.'

# Encode the prompt as a single-example batch and move it to the device
input_ids = torch.tensor([tokenizer.encode(lm_text)]).to(device)

outputs = model.generate(input_ids, eos_token_id=tokenizer.eos_token_id)

# Decode the first sequence without its first token (presumably the decoder start token — confirm)
print(tokenizer.decode(outputs[0][1:]))

In [None]:
# LoRA adapter for attention projection layers

class LoRALayer(nn.Module):
    """Wrap a linear layer with a trainable low-rank update.

    The forward pass returns ``module(x) + x @ adapter_A @ adapter_B``. ``adapter_B`` starts
    at zero, so a freshly created wrapper reproduces ``module`` exactly.

    Args:
        module: Linear layer to wrap; it is stored and called unchanged.
        rank: Inner dimension of the two low-rank factors.
    """

    def __init__(self, module: nn.Linear, rank: int):
        super().__init__()

        self.module = module
        weight = module.weight
        # Keep adapters in the base layer's floating dtype; fall back to float32 otherwise
        dtype = weight.dtype if weight.dtype.is_floating_point else torch.float32

        # Initialise A in the (rank, in_features) layout so Kaiming's fan-in equals
        # in_features. Initialising the (in_features, rank) tensor directly makes fan-in
        # equal to rank and yields much larger values than the reference LoRA init.
        init_a = torch.empty(rank, module.in_features, device=weight.device)
        nn.init.kaiming_uniform_(init_a, a=math.sqrt(5))

        self.adapter_A = nn.Parameter(init_a.t().contiguous().to(dtype))
        self.adapter_B = nn.Parameter(
            torch.zeros(rank, module.out_features, device=weight.device, dtype=dtype)
        )

    def forward(self, hidden_states):
        """Return the wrapped layer's output plus the low-rank correction."""
        base_output = self.module(hidden_states)
        lora_delta = hidden_states @ self.adapter_A @ self.adapter_B
        # Out-of-place add leaves the wrapped layer's output tensor untouched
        return base_output + lora_delta

In [None]:
# Attach LoRA adapters to the attention projections of the model

def add_lora(model, lora_rank=8, target_names=('q', 'v')):
    """Replace selected linear projections with ``LoRALayer`` wrappers, in place.

    For every sub-module, each attribute listed in ``target_names`` that is a plain
    ``nn.Linear`` is wrapped. A layer that is already wrapped is no longer an ``nn.Linear``
    and is skipped, so calling the function again does not stack adapters.

    Args:
        model: Model whose sub-modules are scanned and modified.
        lora_rank: Rank passed to every ``LoRALayer``.
        target_names: Attribute names of the projections to wrap.

    Returns:
        The same ``model`` object, after modification.
    """
    device = next(iter(model.parameters())).device

    # Snapshot the modules first: the loop replaces children, and mutating the tree while
    # a module generator is still walking it is fragile.
    for module in list(model.modules()):
        for attr in target_names:
            layer = getattr(module, attr, None)
            if isinstance(layer, nn.Linear):
                setattr(module, attr, LoRALayer(layer, lora_rank).to(device))

    return model

In [None]:
# Prompt Tuning: trainable prompt embeddings placed at the start of the sequence
class PromptTuningEmbedding(nn.Module):
    """Embedding wrapper that prepends ``prompt_size`` trainable vectors to every sequence.

    In ``forward``, when the id tensor has at least ``prompt_size`` columns the first
    ``prompt_size`` ids are dropped before the lookup (they are treated as placeholders);
    a shorter tensor is embedded whole. The learned prompts are then concatenated in front.

    Args:
        embed_tokens: Embedding module to wrap, or an existing ``PromptTuningEmbedding``
            whose inner embedding is reused.
        prompt_size: Number of trainable prompt vectors.
    """

    def __init__(self, embed_tokens, prompt_size: int):
        super().__init__()
        # Re-wrapping an existing wrapper reuses its inner embedding instead of nesting
        already_wrapped = isinstance(embed_tokens, PromptTuningEmbedding)
        self.embed_tokens = embed_tokens.embed_tokens if already_wrapped else embed_tokens
        self.prompt_size = prompt_size

        hidden_size = self._embedding_width(self.embed_tokens)
        self.learnable_prompts = nn.Parameter(
            torch.randn(1, prompt_size, hidden_size), requires_grad=True
        )

    @staticmethod
    def _embedding_width(embedding):
        """Return the size of the last axis produced by ``embedding``."""
        if isinstance(embedding, nn.Embedding):
            return embedding.weight.shape[-1]

        for attr in ('embedding_dim', 'output_dim'):
            width = getattr(embedding, attr, None)
            if width is not None:
                return width

        # Last resort: embed a single token id and read the width off the result
        try:
            probe_device = next(embedding.parameters()).device
        except (StopIteration, AttributeError):
            if hasattr(embedding, 'weight'):
                probe_device = embedding.weight.device
            else:
                probe_device = torch.device('cpu')
        probe = torch.tensor([[0]], device=probe_device)
        return embedding(probe).shape[-1]

    def forward(self, input_ids):
        n_prompt = self.prompt_size
        # Strip the leading placeholder ids when the sequence is long enough to contain them
        token_ids = input_ids[:, n_prompt:] if input_ids.shape[1] >= n_prompt else input_ids
        token_embeds = self.embed_tokens(token_ids)

        # Broadcast the single stored prompt block across the batch
        prompts = self.learnable_prompts.expand(len(input_ids), n_prompt, -1)
        return torch.cat((prompts, token_embeds), dim=1)

In [None]:
# Data collator that makes room for the prompt tokens in every batch

class PTDataCollator(DataCollatorForSeq2Seq):
    """Seq2seq collator that left-extends each batch by ``prompt_size`` positions.

    After the parent collator has built the batch, ``prompt_size`` placeholder ids (the pad
    token id, or 0 when the tokenizer has none) are prepended to 'input_ids' and the same
    number of ones is prepended to 'attention_mask'.

    Args:
        prompt_size: Number of positions to prepend.
        *args, **kwargs: Forwarded to ``DataCollatorForSeq2Seq``.
    """

    def __init__(self, prompt_size: int, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.prompt_size = prompt_size

    def __call__(self, features):
        # Nothing to collate
        if not features or len(features) == 0:
            return None

        batch = super().__call__(features)
        # Pass through batches that lack the fields this collator extends
        if batch is None or 'attention_mask' not in batch or 'input_ids' not in batch:
            return batch

        mask, ids = batch['attention_mask'], batch['input_ids']
        rows = mask.shape[0]

        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            pad_id = 0

        prompt_mask = torch.ones(rows, self.prompt_size, dtype=mask.dtype, device=mask.device)
        prompt_ids = torch.full(
            (rows, self.prompt_size), pad_id, dtype=ids.dtype, device=mask.device
        )

        batch['attention_mask'] = torch.cat([prompt_mask, mask], dim=1)
        batch['input_ids'] = torch.cat([prompt_ids, ids], dim=1)
        return batch
        

In [None]:
# Apply Prompt Tuning and LoRA to the model

prompt_size = 20
lora_rank = 8

# Wrap the encoder's token embedding so trainable prompts are prepended to encoder inputs
model.encoder.embed_tokens = PromptTuningEmbedding(model.encoder.embed_tokens, prompt_size).to(device)
# add_lora returns the model it was given, so pt_lora_model and model are the same object
pt_lora_model = add_lora(model, lora_rank=lora_rank)

In [None]:
# Freeze the base weights; only the prompt embeddings and LoRA adapters remain trainable

for name, param in pt_lora_model.named_parameters():
    if not any(tag in name for tag in ('learnable_prompts', 'adapter_A', 'adapter_B')):
        param.requires_grad = False

In [None]:
# Report the share of trainable parameters

model_params = sum(p.numel() for p in pt_lora_model.parameters())
trainable_params = sum(p.numel() for p in pt_lora_model.parameters() if p.requires_grad)
# The label says "%", so scale the ratio by 100 (a raw fraction would be off by a factor of 100)
trainable_pct = 100 * trainable_params / model_params if model_params else 0.0
print(
    f'All params: {model_params} | Trainable params: {trainable_params} | '
    f'Trainable %: {trainable_pct:.4f}'
)

In [None]:
# Create the data collator that builds batches with room for the prompt tokens

data_collator = PTDataCollator(
    prompt_size, tokenizer, model=pt_lora_model, padding=True, label_pad_token_id=-100
)

In [None]:
# Configure the training run

training_args = Seq2SeqTrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    eval_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=3e-4,
    weight_decay=0.01,
    fp16=True,
    num_train_epochs=10,
    output_dir="./results",
    logging_steps=5,
    logging_strategy="steps",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    greater_is_better=True,
    save_total_limit=2,
    # Reporting is done through the explicit ClearMLCallback passed to the trainer below
    report_to=[],
    disable_tqdm=False,
    dataloader_pin_memory=False,
    predict_with_generate=True,
    # Without an explicit limit, evaluation-time generation uses the model's default
    # generation length, while the labels are tokenized with a limit of 200 tokens.
    generation_max_length=200,
    dataloader_num_workers=0,
    save_safetensors=False,
)
trainer = Seq2SeqTrainer(
    model=pt_lora_model,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    args=training_args,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[ClearMLCallback()]
)

torch.cuda.empty_cache()

# Train the model
trainer.train()

# Path of the best checkpoint recorded in the trainer state
best_model_path = trainer.state.best_model_checkpoint

# Upload the best checkpoint to ClearML as an artifact
if best_model_path:
    task_lora_pt.upload_artifact(
        name="best_model",
        artifact_object=best_model_path
    )
    print(f"Модель сохранена в ClearML: {best_model_path}")
else:
    print("Путь к лучшей модели не найден")

In [None]:
# Build the model input text by merging the instruction with its optional context
def prepare_input_texts(df_row):
    """Return the prompt text for one data row.

    Args:
        df_row: Mapping-like row with 'instruction' and 'input' entries; missing values
            are treated as empty strings.

    Returns:
        ``instruction + "\\n\\n" + input`` when the input is non-empty, otherwise the
        instruction alone.
    """
    def as_text(value):
        return '' if pd.isna(value) else str(value)

    instruction = as_text(df_row['instruction'])
    context = as_text(df_row['input'])
    return f'{instruction}\n\n{context}' if context else instruction

# Tokenize input texts and reserve positions for the prompt tokens:
# placeholder input_ids are prepended and the attention mask is extended to match
def prepare_inputs_with_prompts(tokenizer, texts, prompt_size, device, max_length=100):
    """Encode ``texts`` and left-extend ids and mask by ``prompt_size`` positions.

    Args:
        tokenizer: Callable tokenizer exposing ``pad_token_id``.
        texts: Texts to encode as one padded batch.
        prompt_size: Number of placeholder positions to prepend.
        device: Device the encoded batch and the placeholders are placed on.
        max_length: Truncation limit passed to the tokenizer.

    Returns:
        Tuple ``(input_ids, attention_mask)``, each with ``prompt_size`` extra leading
        columns: pad ids (0 when the tokenizer has no pad token) and ones respectively.
    """
    batch = tokenizer(
        texts,
        max_length=max_length,
        truncation=True,
        padding=True,
        return_tensors='pt'
    ).to(device)
    token_ids = batch['input_ids']
    rows = token_ids.shape[0]

    filler_id = tokenizer.pad_token_id
    if filler_id is None:
        filler_id = 0

    placeholder_ids = torch.full(
        (rows, prompt_size), filler_id, dtype=token_ids.dtype, device=device
    )
    extended_input_ids = torch.cat([placeholder_ids, token_ids], dim=1)

    token_mask = batch['attention_mask']
    placeholder_mask = torch.ones(rows, prompt_size, dtype=token_mask.dtype, device=device)
    extended_attention_mask = torch.cat([placeholder_mask, token_mask], dim=1)

    return extended_input_ids, extended_attention_mask

# Generate predictions for a batch of input texts
def generate_predictions(
    model,
    tokenizer,
    input_texts,
    prompt_size,
    device,
    max_length=200,
    num_beams=4,
    temperature=0.2,
    top_k=30,
    top_p=0.95,
    do_sample=True,
    **generate_kwargs
):
    """Encode ``input_texts`` with prompt placeholders, run ``model.generate`` and decode.

    Args:
        model: Model exposing ``generate``.
        tokenizer: Tokenizer used for encoding and decoding.
        input_texts: Texts to generate continuations for.
        prompt_size: Number of placeholder positions prepended to the inputs.
        device: Device the inputs are placed on.
        max_length, num_beams, temperature, top_k, top_p, do_sample: Generation settings
            forwarded to ``model.generate``.
        **generate_kwargs: Extra generation settings; these override the defaults above.

    Returns:
        List of decoded strings with special tokens removed.
    """
    input_ids, attention_mask = prepare_inputs_with_prompts(
        tokenizer, input_texts, prompt_size, device
    )

    # Fall back to the pad token when the tokenizer defines no end-of-sequence token
    eos_id = tokenizer.eos_token_id
    if eos_id is None:
        eos_id = tokenizer.pad_token_id

    settings = dict(
        max_length=max_length,
        num_beams=num_beams,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        do_sample=do_sample,
        early_stopping=True,
        eos_token_id=eos_id,
        pad_token_id=tokenizer.pad_token_id,
    )
    settings = {**settings, **generate_kwargs}

    sequences = model.generate(input_ids, attention_mask=attention_mask, **settings)

    return tokenizer.batch_decode(
        sequences,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True
    )


In [None]:
# Evaluate the model on the test set

test_data = pd.read_csv('/kaggle/input/ru_instruct_gpt4_test.tsv', sep='\t')

pt_lora_model.eval()

predictions = []
references = []
batch_size = 2

with torch.no_grad():
    for batch_idx, i in enumerate(tqdm(range(0, len(test_data), batch_size))):
        batch = test_data.iloc[i:i+batch_size]

        input_texts = [prepare_input_texts(row) for _, row in batch.iterrows()]
        # Missing reference outputs are scored as empty strings
        batch_refs = [str(row['output']) if pd.notna(row['output']) else '' for _, row in batch.iterrows()]

        batch_predictions = generate_predictions(
            pt_lora_model,
            tokenizer,
            input_texts,
            prompt_size,
            device,
            max_length=200,
            num_beams=4,
            temperature=0.2,
            top_k=30,
            top_p=0.95,
            do_sample=True
        )

        predictions.extend(batch_predictions)
        references.extend(batch_refs)

        # Release cached GPU memory every five batches
        if (batch_idx + 1) % 5 == 0:
            torch.cuda.empty_cache()

torch.cuda.empty_cache()

# BLEU and METEOR receive one reference list per prediction
references_list = [[ref] for ref in references]

bleu_result = bleu_metric.compute(predictions=predictions, references=references_list)
# The default ROUGE tokenizer keeps only Latin letters and digits, which would discard
# Cyrillic text entirely; score on lower-cased whitespace tokens instead.
rouge_result = rouge_metric.compute(
    predictions=predictions,
    references=references,
    tokenizer=lambda text: text.lower().split(),
)
meteor_result = meteor_metric.compute(predictions=predictions, references=references_list)

print(f"BLEU: {bleu_result['bleu']:.4f}")
print(f"ROUGE-1: {rouge_result['rouge1']:.4f}")
print(f"METEOR: {meteor_result['meteor']:.4f}")

# Attach the final test scores to the ClearML task
task_lora_pt.connect({
    "test_bleu": bleu_result['bleu'],
    "test_rouge1": rouge_result['rouge1'],
    "test_meteor": meteor_result['meteor']
})


In [None]:
# Check sample generations after fine-tuning (same prompt as the baseline cell)

lm_text='Придумайте название для книги на основе следующего описания сюжета.\n\nМолодой парень находит портал в параллельный мир, где его жизнь намного хуже, чем она была до этого.'

predictions = generate_predictions(
    pt_lora_model,
    tokenizer,
    [lm_text],
    prompt_size,
    device,
    max_length=200,
    num_beams=4,
    temperature=0.2,
    top_k=30,
    top_p=0.95,
    do_sample=True
        )

print(predictions[0])

In [None]:
# Sample generation: instruction with keyword context (2 beams, temperature 0.2)
lm_text='Придумай мотивирующую цитату.\n\nСпорт,усилия,результат.'

predictions = generate_predictions(
    pt_lora_model,
    tokenizer,
    [lm_text],
    prompt_size,
    device,
    max_length=200,
    num_beams=2,
    temperature=0.2,
    top_k=30,
    top_p=0.95,
    do_sample=True
)

print(predictions[0])

In [None]:
# Sample generation: story from a given structure (2 beams, temperature 0.2)
lm_text='Используя указанную структуру, создай рассказ.\n\nЛето, школьница и учитель, встреча'

predictions = generate_predictions(
    pt_lora_model,
    tokenizer,
    [lm_text],
    prompt_size,
    device,
    max_length=200,
    num_beams=2,
    temperature=0.2,
    top_k=30,
    top_p=0.95,
    do_sample=True
)

print(predictions[0])

In [None]:
# Sample generation: instruction without context (4 beams, temperature 0.8, top_k 50)
lm_text='Расскажи историю про медведя'

predictions = generate_predictions(
    pt_lora_model,
    tokenizer,
    [lm_text],
    prompt_size,
    device,
    max_length=200,
    num_beams=4,
    temperature=0.8,
    top_k=50,
    top_p=0.95,
    do_sample=True
)

print(predictions[0])

In [None]:
# Sample generation: single-line instruction, pure sampling (1 beam, temperature 0.9, top_p 0.9)
lm_text='Расскажи историю о девочке. В истории должны быть:игры, подружка, счастливая история'

predictions = generate_predictions(
    pt_lora_model,
    tokenizer,
    [lm_text],
    prompt_size,
    device,
    max_length=200,
    num_beams=1,
    temperature=0.9,
    top_k=50,
    top_p=0.9,
    do_sample=True
)

print(predictions[0])

In [None]:
# Sample generation: definition-style question, pure sampling (1 beam, temperature 0.9, top_p 0.9)
lm_text='Что означает слово "землетрясение".'

predictions = generate_predictions(
    pt_lora_model,
    tokenizer,
    [lm_text],
    prompt_size,
    device,
    max_length=200,
    num_beams=1,
    temperature=0.9,
    top_k=50,
    top_p=0.9,
    do_sample=True
)

print(predictions[0])

In [None]:
# Sample generation: motivational quote request, pure sampling (1 beam, temperature 0.9, top_p 0.9)
lm_text='Придумай мотивирующую цитату, которая поднимет настроение для учебы.'

predictions = generate_predictions(
    pt_lora_model,
    tokenizer,
    [lm_text],
    prompt_size,
    device,
    max_length=200,
    num_beams=1,
    temperature=0.9,
    top_k=50,
    top_p=0.9,
    do_sample=True
)

print(predictions[0])

In [None]:
# Close the ClearML task
task_lora_pt.close()