Здесь тестируется не Seq2Seq, а Decoder-Only модель, из-за чего таргет так-же включает в себя промпт

In [1]:
import torch
import uuid
import os
import evaluate
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)
from peft import LoraConfig, get_peft_model, TaskType # type: ignore
from tqdm import tqdm

In [2]:
train = Dataset.from_parquet('./__output__/matched_train.parquet')
test = Dataset.from_parquet('./__output__/matched_test.parquet')

In [3]:
train

Dataset({
    features: ['nmId', 'text', 'responder', 'type', 'product_name', 'product_category_2', 'product_category_1', 'product_color', 'product_description', 'product_brand', 'vector', 'toxicity', 'emotions', 'target_nmId', 'target_text', 'target_responder', 'target_type', 'target_product_name', 'target_product_category_2', 'target_product_category_1', 'target_product_color', 'target_product_description', 'target_product_brand', 'target_vector', 'target_toxicity', 'target_emotions', '__index_level_0__'],
    num_rows: 15147
})

In [4]:
test

Dataset({
    features: ['nmId', 'text', 'responder', 'type', 'product_name', 'product_category_2', 'product_category_1', 'product_color', 'product_description', 'product_brand', 'vector', 'toxicity', 'emotions', 'target_nmId', 'target_text', 'target_responder', 'target_type', 'target_product_name', 'target_product_category_2', 'target_product_category_1', 'target_product_color', 'target_product_description', 'target_product_brand', 'target_vector', 'target_toxicity', 'target_emotions', '__index_level_0__'],
    num_rows: 1683
})

In [5]:
model = AutoModelForCausalLM.from_pretrained(
    'Qwen/Qwen2.5-0.5B',
    device_map='cuda:0',
    torch_dtype=torch.bfloat16,
    attn_implementation='flash_attention_2'
)
tokenizer = AutoTokenizer.from_pretrained(
    'Qwen/Qwen2.5-0.5B',
    useFast=True
)

coll = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [6]:
sum([p.numel() for p in model.parameters() if p.requires_grad])

494032768

In [7]:
peft_conf = LoraConfig(
    TaskType.SEQ_2_SEQ_LM,
    r=12,
    lora_alpha=32,
    lora_dropout=0.1,
    inference_mode=False
)

peft_model = get_peft_model(model, peft_conf)

In [8]:
sum([p.numel() for p in peft_model.parameters() if p.requires_grad])

811008

In [9]:
max([len(tokenizer.tokenize(row['text'])) for row in train]) # type: ignore

564

In [10]:
max([len(tokenizer.tokenize(row['target_text'])) for row in train]) # type: ignore

523

In [11]:
def preprocess(e):
    out = tokenizer(
        f'''
ответчик: {e['target_responder']}
тип: {e['target_type']}
название: {e['target_product_name']}
категория 2: {e['target_product_category_2']}
цвет: {e['target_product_color']}
бренд: {e['target_product_brand']}
описание: {e['target_product_description']}
токсичность: {e['toxicity']}
эиоциональность: {e['emotions']}
текст: {e['text']}
ответ:
        ''',
        truncation=True,
        padding='max_length',
        max_length=1000,
        return_tensors='pt'
    )

    out['input_ids'] = out['input_ids'][0] # type: ignore
    out['attention_mask'] = out['attention_mask'][0] # type: ignore

    labels = tokenizer(
         f'''
ответчик: {e['target_responder']}
тип: {e['target_type']}
название: {e['target_product_name']}
категория 2: {e['target_product_category_2']}
цвет: {e['target_product_color']}
бренд: {e['target_product_brand']}
описание: {e['target_product_description']}
токсичность: {e['toxicity']}
эиоциональность: {e['emotions']}
текст: {e['text']}
ответ:
{e['target_text']}
        ''',
        text_target=e['target_text'], 
        max_length=1500,         
        truncation=True,
        padding='max_length',
        return_tensors='pt'
    )
    out['labels'] = labels['input_ids'][0] # type: ignore
    
    return out

In [12]:
train = train.map(preprocess, remove_columns=train.column_names) # type: ignore
test = test.map(preprocess, remove_columns=test.column_names) # type: ignore

train

Map:   0%|          | 0/15147 [00:00<?, ? examples/s]

Map:   0%|          | 0/1683 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 15147
})

In [13]:
checkpoint = str(uuid.uuid4())

checkpoint

'3fd3a3fb-f114-485d-8611-8ae6f233f1c2'

In [14]:
os.makedirs(f'./models/{checkpoint}')

In [15]:
args = TrainingArguments(
    output_dir=f'./models/{checkpoint}/runs',
    eval_strategy='epoch',
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    learning_rate=0.001,
    warmup_ratio=0.1,
    weight_decay=0.01,
    save_strategy='no',
    remove_unused_columns=False,
    bf16=True
)

trainer = Trainer(
    model=peft_model, # type: ignore
    args=args,
    data_collator=coll,
    train_dataset=train, # type: ignore
    eval_dataset=test, # type: ignore
)

In [16]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.9605,1.95567
2,2.1368,3.470997
3,3.7423,3.605571
4,3.563,3.470597
5,3.5359,3.412648


TrainOutput(global_step=37870, training_loss=3.012409730180449, metrics={'train_runtime': 8430.678, 'train_samples_per_second': 8.983, 'train_steps_per_second': 4.492, 'total_flos': 1.630010112192e+17, 'train_loss': 3.012409730180449, 'epoch': 5.0})

In [17]:
trainer.save_model(f'./models/{checkpoint}/model')

In [18]:
meteor = evaluate.load('evaluate-metric/meteor')
rouge = evaluate.load('evaluate-metric/rouge')
bleu = evaluate.load('evaluate-metric/bleu')

[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ubuntu/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Из-за долгого инференса, реимпортирую модель, но теперь с flash_attention2

In [19]:
targets = []
preds = []
for row in tqdm(test.batch(20)): # type: ignore
    text_input_ids = torch.LongTensor(row['input_ids']).to('cuda:0') # type: ignore
    text_attention_mask = torch.LongTensor(row['attention_mask']).to('cuda:0') # type: ignore

    model_out = model.generate(
        input_ids=text_input_ids, 
        attention_mask=text_attention_mask,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        max_new_tokens=300,
        top_k=1,
        do_sample=True
    )

    labels_decoded = tokenizer.batch_decode(
        row['labels'], # type: ignore
        skip_special_tokens=True
    )
    preds_decoded = tokenizer.batch_decode(
        model_out,
        skip_special_tokens=True
    )

    for i in range(len(labels_decoded)):
        if 'ответ:' in labels_decoded[i] and 'ответ:' in preds_decoded[i]:
            preds.append(preds_decoded[i].split('ответ:')[1])
            targets.append(labels_decoded[i].split('ответ:')[1])

Batching examples:   0%|          | 0/1683 [00:00<?, ? examples/s]

  0%|          | 0/85 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  1%|          | 1/85 [00:17<25:11, 17.99s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  2%|▏         | 2/85 [00:34<23:37, 17.08s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  4%|▎         | 3/85 [00:50<22:55, 16.78s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  5%|▍         | 4/85 [01:06<22:17, 16.51s/it]A decoder-only architecture is being used, but right-padding was detected! For correct

Модель не может показать на указанном датасете устойчивую сходимость, а так же имеет худшую оценку по метрикам

In [20]:
print(f'''
Test metrice:
Meteor: {meteor.compute(predictions=preds, references=targets)['meteor']}
Rouge: {rouge.compute(predictions=preds, references=targets)['rougeLsum']},
Bleu: {bleu.compute(predictions=preds, references=targets)['bleu']}
''')


Test metrice:
Meteor: 0.014381068980355698
Rouge: 0.0,
Bleu: 0.0

