In [18]:
import torch
import uuid
import os
import evaluate
from datasets import Dataset
from transformers import (
    T5ForConditionalGeneration,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)
from peft import LoraConfig, get_peft_model, TaskType # type: ignore
from tqdm import tqdm

In [19]:
train = Dataset.from_parquet('./__output__/matched_train.parquet')
test = Dataset.from_parquet('./__output__/matched_test.parquet')

In [20]:
train

Dataset({
    features: ['nmId', 'text', 'responder', 'type', 'product_name', 'product_category_2', 'product_category_1', 'product_color', 'product_description', 'product_brand', 'vector', 'toxicity', 'emotions', 'target_nmId', 'target_text', 'target_responder', 'target_type', 'target_product_name', 'target_product_category_2', 'target_product_category_1', 'target_product_color', 'target_product_description', 'target_product_brand', 'target_vector', 'target_toxicity', 'target_emotions', '__index_level_0__'],
    num_rows: 15147
})

In [21]:
test

Dataset({
    features: ['nmId', 'text', 'responder', 'type', 'product_name', 'product_category_2', 'product_category_1', 'product_color', 'product_description', 'product_brand', 'vector', 'toxicity', 'emotions', 'target_nmId', 'target_text', 'target_responder', 'target_type', 'target_product_name', 'target_product_category_2', 'target_product_category_1', 'target_product_color', 'target_product_description', 'target_product_brand', 'target_vector', 'target_toxicity', 'target_emotions', '__index_level_0__'],
    num_rows: 1683
})

In [22]:
model = T5ForConditionalGeneration.from_pretrained(
    'google/mt5-small',
    device_map='cuda:0',
    torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(
    'google/mt5-small'
)

coll = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)

You are using a model of type mt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


In [23]:
sum([p.numel() for p in model.parameters() if p.requires_grad])

300176768

In [24]:
peft_conf = LoraConfig(
    TaskType.SEQ_2_SEQ_LM,
    r=32,
    lora_alpha=32,
    target_modules=['q', 'v'],
    lora_dropout=0.1,
    bias='none',
    inference_mode=False
)

peft_model = get_peft_model(model, peft_conf)

In [25]:
sum([p.numel() for p in peft_model.parameters() if p.requires_grad])

1376256

In [26]:
max([len(tokenizer.tokenize(row['text'])) for row in train]) # type: ignore

506

In [27]:
max([len(tokenizer.tokenize(row['target_text'])) for row in train]) # type: ignore

484

In [28]:
tokenizer.model_max_length

1000000000000000019884624838656

In [29]:
def preprocess(e):
    out = tokenizer(
        f'''
responder: {e['target_responder']}
type: {e['target_type']}
product_name: {e['target_product_name']}
product_category_2: {e['target_product_category_2']}
product_color: {e['target_product_color']}
product_brand: {e['target_product_brand']}
product_description: {e['target_product_description']}
toxicity: {e['toxicity']}
emotions: {e['emotions']}
text: {e['text']}
        ''',
        truncation=True,
        max_length=1500,
        return_tensors='pt'
    )

    out['input_ids'] = out['input_ids'][0] # type: ignore
    out['attention_mask'] = out['attention_mask'][0] # type: ignore

    labels = tokenizer(
        text_target=e['target_text'],
        truncation=True,
        max_length=1500,
        return_tensors='pt'
    )
    out['labels'] = labels['input_ids'][0] # type: ignore
    
    return out

In [30]:
train = train.map(preprocess, remove_columns=train.column_names) # type: ignore
test = test.map(preprocess, remove_columns=test.column_names) # type: ignore

train

Map:   0%|          | 0/1683 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 15147
})

In [31]:
checkpoint = str(uuid.uuid4())

checkpoint

'e7ceaf61-8dbf-4c7e-a7a8-cdec2659cec6'

In [32]:
os.makedirs(f'./models/{checkpoint}')

In [33]:
args = Seq2SeqTrainingArguments(
    output_dir=f'./models/{checkpoint}/runs',
    eval_strategy='epoch',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    learning_rate=0.001,
    warmup_ratio=0.1,
    weight_decay=0.01,
    save_strategy='no',
    remove_unused_columns=False,
    bf16=True,
)

trainer = Seq2SeqTrainer(
    model=peft_model, # type: ignore
    args=args,
    data_collator=coll,
    train_dataset=train, # type: ignore
    eval_dataset=test, # type: ignore
)

In [34]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,4.5421,3.762706
2,4.3146,3.626596
3,4.1856,3.565572
4,4.1173,3.521024
5,4.0604,3.488883


TrainOutput(global_step=18935, training_loss=4.539518489187929, metrics={'train_runtime': 2392.5005, 'train_samples_per_second': 31.655, 'train_steps_per_second': 7.914, 'total_flos': 4.877117972780851e+16, 'train_loss': 4.539518489187929, 'epoch': 5.0})

In [35]:
trainer.save_model(f'./models/{checkpoint}/model')

You are using a model of type mt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


In [36]:
meteor = evaluate.load('evaluate-metric/meteor')
rouge = evaluate.load('evaluate-metric/rouge')
bleu = evaluate.load('evaluate-metric/bleu')

[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ubuntu/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [37]:
targets = []
preds = []
for row in tqdm(test):
    text_input_ids = torch.LongTensor([row['input_ids']]).to('cuda:0')
    text_attention_mask = torch.LongTensor([row['attention_mask']]).to('cuda:0')

    model_out = model.generate(
        input_ids=text_input_ids, 
        attention_mask=text_attention_mask,
        top_k=1,
        do_sample=True
    )[0]

    preds.append(tokenizer.decode(model_out, skip_special_tokens=True))
    targets.append(tokenizer.decode(row['labels'], skip_special_tokens=True))

100%|██████████| 1683/1683 [09:11<00:00,  3.05it/s]


In [38]:
print(f'''
Test metrice:
Meteor: {meteor.compute(predictions=preds, references=targets)['meteor']}
Rouge: {rouge.compute(predictions=preds, references=targets)['rougeLsum']},
Bleu: {bleu.compute(predictions=preds, references=targets)['bleu']}
''')


Test metrice:
Meteor: 0.16351780489796688
Rouge: 0.01674501307703882,
Bleu: 0.00453352544742292

