In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', DEVICE)
print()

Using device: cuda



In [2]:
model_name = "cointegrated/rut5-small-chitchat"

tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


In [3]:
from transformers import TextDataset, DataCollatorForLanguageModeling

train_path = 'dataset/abatur_train.csv'

# –°–æ–∑–¥–∞–Ω–∏–µ –¥–∞—Ç–∞—Å–µ—Ç–∞
train_dataset = TextDataset(tokenizer=tokenizer,file_path=train_path,block_size=64)

# –°–æ–∑–¥–∞–Ω–∏–µ –¥–∞—Ç–∞–ª–æ–¥–µ—Ä–∞ (–Ω–∞—Ä–µ–∑–∞–µ—Ç —Ç–µ–∫—Å—Ç –Ω–∞ –æ–ø—Ç–∏–º–∞–ª—å–Ω—ã–µ –ø–æ –¥–ª–∏–Ω–µ –∫—É—Å–∫–∏)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)



In [4]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./finetuned", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=2000, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    per_device_eval_batch_size=32,  # batch size for evaluation
    warmup_steps=10,# number of warmup steps for learning rate scheduler
    gradient_accumulation_steps=16, # to make "virtual" batch size larger
    )



trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    optimizers = (torch.optim.AdamW(model.parameters(),lr=1e-5),None) # Optimizer and lr scheduler
)

In [None]:
trainer.train()
trainer.save_model("./final_save")

In [31]:
text = '–í —á–µ–º –∑–∞–∫–ª—é—á–∞–µ—Ç—Å—è —Ç–≤–æ—è —Ä–∞–±–æ—Ç–∞ ?'
inputs = tokenizer(text, return_tensors='pt').to(DEVICE)
with torch.no_grad():
    gen = model.generate(
        **inputs, 
        do_sample=True, top_p=0.5, num_return_sequences=1, 
        repetition_penalty=2.5,
        max_length=64,
    )
for h in gen:
    print(tokenizer.decode(h, skip_special_tokens=True))

–†–∞–±–æ—Ç–∞ –∑–∞–∫–ª—é—á–∞–µ—Ç—Å—è –≤ —Ç–æ–º, —á—Ç–æ —Ç—ã –∏–º–µ–µ—à—å –ø—Ä–∞–≤–æ –Ω–∞ —Å–≤–æ—é –∂–∏–∑–Ω—å.


In [39]:
text = '–í —á–µ–º –∑–∞–∫–ª—é—á–∞–µ—Ç—Å—è —Ç–≤–æ—è —Ä–∞–±–æ—Ç–∞ ?'
inputs = tokenizer(text, return_tensors='pt').to(DEVICE)
with torch.no_grad():
    hypotheses = model.generate(
        **inputs, 
        do_sample=True, temperature=1.1,
                     max_length=64)
    
for h in hypotheses:
    print(tokenizer.decode(h, skip_special_tokens=True))

–†–∞–±–æ—Ç–∞ –∑–∞–∫–ª—é—á–∞–µ—Ç—Å—è –≤ —ç—Ç–æ–º. –í—Å–µ–ª–µ–Ω–Ω–æ–π


In [35]:
# –°—ç–º–ø–ª–∏—Ä–æ–≤–∞–Ω–∏–µ —Å —Ç–µ–º–ø–µ—Ä–∞—Ç—É—Ä–æ–π
text = "–í —á–µ–º –∑–∞–∫–ª—é—á–∞–µ—Ç—Å—è —Ç–≤–æ—è —Ä–∞–±–æ—Ç–∞ ? "
input_ids = tokenizer.encode(text, return_tensors="pt").to(DEVICE)
model.eval()
with torch.no_grad():
    out = model.generate(input_ids, 
                     do_sample=True, 
                     temperature=1.1,
                     max_length=64) 
                     
# 1.4
# 1.6
generated_text = list(map(tokenizer.decode, out))[0]

print(generated_text)

<pad> –í —Ç–æ–º, —á—Ç–æ–±—ã —è –Ω–∞—á–∞–ª –±—ã —Å —ç—Ç–∏–º. –†–∞–±–æ—Ç–∞, –∫–æ—Ç–æ—Ä–∞—è –∑–∞–∫–ª—é—á–∞–µ—Ç—Å—è –≤ –Ω–∏—Ö.</s>
