In [1]:
import torch
import uuid
import os
import evaluate
from datasets import Dataset
from transformers import (
    T5ForConditionalGeneration,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)
from peft import LoraConfig, get_peft_model, TaskType # type: ignore
from tqdm import tqdm

In [2]:
train = Dataset.from_parquet('./__output__/matched_train.parquet')
test = Dataset.from_parquet('./__output__/matched_test.parquet')

In [3]:
train

Dataset({
    features: ['nmId', 'text', 'responder', 'type', 'product_name', 'product_category_2', 'product_category_1', 'product_color', 'product_description', 'product_brand', 'vector', 'toxicity', 'emotions', 'target_nmId', 'target_text', 'target_responder', 'target_type', 'target_product_name', 'target_product_category_2', 'target_product_category_1', 'target_product_color', 'target_product_description', 'target_product_brand', 'target_vector', 'target_toxicity', 'target_emotions', '__index_level_0__'],
    num_rows: 15147
})

In [4]:
test

Dataset({
    features: ['nmId', 'text', 'responder', 'type', 'product_name', 'product_category_2', 'product_category_1', 'product_color', 'product_description', 'product_brand', 'vector', 'toxicity', 'emotions', 'target_nmId', 'target_text', 'target_responder', 'target_type', 'target_product_name', 'target_product_category_2', 'target_product_category_1', 'target_product_color', 'target_product_description', 'target_product_brand', 'target_vector', 'target_toxicity', 'target_emotions', '__index_level_0__'],
    num_rows: 1683
})

In [5]:
model = T5ForConditionalGeneration.from_pretrained(
    'ai-forever/ruT5-base',
    device_map='cuda:0',
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(
    'ai-forever/ruT5-base',
    useFast=True
)

coll = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [6]:
sum([p.numel() for p in model.parameters() if p.requires_grad])

222903552

In [7]:
peft_conf = LoraConfig(
    TaskType.SEQ_2_SEQ_LM,
    r=16,
    lora_alpha=32,
    target_modules=['q', 'v'],
    lora_dropout=0.1,
    bias='none',
    inference_mode=False
)

peft_model = get_peft_model(model, peft_conf)

In [8]:
sum([p.numel() for p in peft_model.parameters() if p.requires_grad])

1769472

In [9]:
def preprocess(e):
    out = tokenizer(
        f'''
ответчик: {e['target_responder']}
тип: {e['target_type']}
название: {e['target_product_name']}
категория 2: {e['target_product_category_2']}
цвет: {e['target_product_color']}
бренд: {e['target_product_brand']}
описание: {e['target_product_description']}
токсичность: {e['toxicity']}
эиоциональность: {e['emotions']}
текст: {e['text']}
        ''',
        truncation=True,
        max_length=1400,
        return_tensors='pt'
    )

    out['input_ids'] = out['input_ids'][0] # type: ignore
    out['attention_mask'] = out['attention_mask'][0] # type: ignore

    labels = tokenizer(
        text_target=e['target_text'], 
        max_length=1200,         
        truncation=True,
        return_tensors='pt'
    )
    out['labels'] = labels['input_ids'][0] # type: ignore
    
    return out

In [10]:
train = train.map(preprocess, remove_columns=train.column_names) # type: ignore
test = test.map(preprocess, remove_columns=test.column_names) # type: ignore

train

Map:   0%|          | 0/15147 [00:00<?, ? examples/s]

Map:   0%|          | 0/1683 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 15147
})

In [11]:
checkpoint = str(uuid.uuid4())

checkpoint

'25494ed9-f307-4293-a192-80678a7aca3a'

In [12]:
os.makedirs(f'./models/{checkpoint}')

In [13]:
args = Seq2SeqTrainingArguments(
    output_dir=f'./models/{checkpoint}/runs',
    eval_strategy='epoch',
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
    num_train_epochs=5,
    learning_rate=0.001,
    warmup_ratio=0.1,
    weight_decay=0.01,
    save_strategy='no',
    remove_unused_columns=False,
    bf16=True
)

trainer = Seq2SeqTrainer(
    model=peft_model, # type: ignore
    args=args,
    data_collator=coll,
    train_dataset=train, # type: ignore
    eval_dataset=test, # type: ignore
)

In [14]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,4.1544,3.685551
2,3.9747,3.571082
3,3.8785,3.517082
4,3.7937,3.486148
5,3.7601,3.463229


TrainOutput(global_step=15150, training_loss=4.0155459000489895, metrics={'train_runtime': 2968.5451, 'train_samples_per_second': 25.512, 'train_steps_per_second': 5.104, 'total_flos': 4.788360886682419e+16, 'train_loss': 4.0155459000489895, 'epoch': 5.0})

In [15]:
trainer.save_model(f'./models/{checkpoint}/model')

In [16]:
meteor = evaluate.load('evaluate-metric/meteor')
rouge = evaluate.load('evaluate-metric/rouge')
bleu = evaluate.load('evaluate-metric/bleu')

[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ubuntu/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [17]:
targets = []
preds = []
for row in tqdm(test):
    text_input_ids = torch.LongTensor([row['input_ids']]).to('cuda:0')
    text_attention_mask = torch.LongTensor([row['attention_mask']]).to('cuda:0')

    model_out = model.generate(
        input_ids=text_input_ids, 
        attention_mask=text_attention_mask,
        top_k=1,
        do_sample=True
    )[0]

    preds.append(tokenizer.decode(model_out, skip_special_tokens=True))
    targets.append(tokenizer.decode(row['labels'], skip_special_tokens=True))

100%|██████████| 1683/1683 [11:35<00:00,  2.42it/s]


In [18]:
print(f'''
Test metrice:
Meteor: {meteor.compute(predictions=preds, references=targets)['meteor']}
Rouge: {rouge.compute(predictions=preds, references=targets)['rougeLsum']},
Bleu: {bleu.compute(predictions=preds, references=targets)['bleu']}
''')


Test metrice:
Meteor: 0.20426985994927843
Rouge: 0.02542046930974775,
Bleu: 0.011637031268017102

