# Loading rut5 model

In [14]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

In [15]:
# !pip install sentencepiece

In [16]:
from_pkl_file = True

tokenizer_file = 't5_tokenizer.pkl'
model_file = 't5_model.pkl'

In [17]:
import pickle

user_prefix = '<u>'
bot_prefix = '<b>'

if from_pkl_file:
    with open(tokenizer_file, 'rb') as f:
        tokenizer = pickle.load(f)
else:
    tokenizer = T5Tokenizer.from_pretrained(
        'cointegrated/rut5-small-chitchat2'
    )

    with open(tokenizer_file, 'wb') as f:
        pickle.dump(tokenizer, f)


In [18]:
model: T5ForConditionalGeneration

In [19]:
if from_pkl_file:
    with open(model_file, 'rb') as f:
        model = pickle.load(f)
else:
    model = T5ForConditionalGeneration.from_pretrained('cointegrated/rut5-small-chitchat2')
    with open(model_file, 'wb') as f:
        pickle.dump(model, f)

# Test run

In [7]:
text = 'Привет! Расскажи, как твои дела?'
inputs = tokenizer(text, return_tensors='pt')

In [8]:
with torch.no_grad():
    # https://huggingface.co/docs/transformers/main/en/main_classes/text_generation
    hypotheses = model.generate(
        **inputs,
        temperature=0.9,
        do_sample=True,  # sampling or greedy decoding 
        # (at each decoding step selects the token with the highest prob without considering the impact on future tokens)
        top_p=0.7,
        num_return_sequences=3,
        repetition_penalty=2.5,  # https://arxiv.org/pdf/1909.05858.pdf
        max_length=32,
    )

In [9]:
for h in hypotheses:
    print(tokenizer.decode(h, skip_special_tokens=True))

Как?
У меня ничего нет.
Что это?


# Fine-Tuning

In [5]:
from hw3.data_preparer import load_matreshka_fort5

df_train, df_val = load_matreshka_fort5()

In [11]:
print(df_train.columns)

Index(['source_text', 'target_text'], dtype='object')


In [12]:
def tokenize_data(data, column_name, max_length=64):
    source_texts = data[column_name].tolist()
    tokens = tokenizer.batch_encode_plus(
        source_texts,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    return tokens

In [13]:
train_tokens = tokenize_data(df_train, column_name='source_text')
train_target_tokens = tokenize_data(df_train, column_name='target_text')

val_tokens = tokenize_data(df_val, column_name='source_text')
val_target_tokens = tokenize_data(df_val, column_name='target_text')

In [14]:
from torch.utils.data import TensorDataset

train_dataset = TensorDataset(
    train_tokens['input_ids'],
    train_tokens['attention_mask'],
    train_target_tokens['input_ids'],
    train_target_tokens['attention_mask'],
)

val_dataset = TensorDataset(
    val_tokens['input_ids'],
    val_tokens['attention_mask'],
    val_target_tokens['input_ids'],
    val_target_tokens['attention_mask'],
)

In [15]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

In [16]:
device = 'mps' if torch.backends.mps.is_built() else 'cuda' if torch.cuda.is_available() else 'cpu'

print(device)

mps


In [17]:
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(20100, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(20100, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedGeluDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo):

In [18]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

In [28]:
from torch.utils.data import DataLoader
from tqdm import tqdm


def train_epoch(
        model,
        loader: DataLoader,
        epoch,
        num_epochs,
        optimizer,
        mode,
):
    total_loss = 0

    for x_input_ids, x_attention_mask, y_input_ids, y_attention_mask in tqdm(loader,
                                                                             desc=f'{mode} epoch {epoch}/{num_epochs}...'):
        if mode == 'Training':
            optimizer.zero_grad()

        x_input_ids = x_input_ids.to(device)
        x_attention_mask = x_attention_mask.to(device)
        y_input_ids = y_input_ids.to(device)
        y_attention_mask = y_attention_mask.to(device)

        outputs = model(
            input_ids=x_input_ids,
            attention_mask=x_attention_mask,
            labels=y_input_ids,
            decoder_attention_mask=y_attention_mask,
            return_dict=True,
        )
        outputs_loss = outputs.loss
        total_loss += outputs_loss.item()

        if mode == 'Training':
            outputs_loss.backward()
            optimizer.step()

    loss = total_loss / len(loader)
    print(f'{mode} epoch {epoch + 1}/{num_epochs}: {mode} Loss: {loss:.4f}')

In [20]:
def train(
        model,
        train_loader,
        val_loader,
        optimizer,
        num_epochs=5,
):
    for epoch in range(num_epochs):
        model.train()

        train_epoch(model, train_loader, epoch, num_epochs, optimizer, mode='Training')

        model.eval()

        with torch.no_grad():
            train_epoch(model, val_loader, epoch, num_epochs, optimizer, mode='Validating')

In [21]:
train(model, train_loader, val_loader, optimizer)

Training epoch 0/5...: 100%|██████████| 1858/1858 [06:58<00:00,  4.44it/s]


Training epoch 1/5: Training Loss: 3.3130


Validating epoch 0/5...: 100%|██████████| 462/462 [00:19<00:00, 23.42it/s]


Validating epoch 1/5: Validating Loss: 1.0373


Training epoch 1/5...: 100%|██████████| 1858/1858 [06:48<00:00,  4.55it/s]


Training epoch 2/5: Training Loss: 1.1510


Validating epoch 1/5...: 100%|██████████| 462/462 [00:19<00:00, 23.56it/s]


Validating epoch 2/5: Validating Loss: 0.9610


Training epoch 2/5...: 100%|██████████| 1858/1858 [06:48<00:00,  4.55it/s]


Training epoch 3/5: Training Loss: 1.0823


Validating epoch 2/5...: 100%|██████████| 462/462 [00:19<00:00, 23.74it/s]


Validating epoch 3/5: Validating Loss: 0.9284


Training epoch 3/5...: 100%|██████████| 1858/1858 [06:48<00:00,  4.54it/s]


Training epoch 4/5: Training Loss: 1.0429


Validating epoch 3/5...: 100%|██████████| 462/462 [00:19<00:00, 23.70it/s]


Validating epoch 4/5: Validating Loss: 0.9078


Training epoch 4/5...: 100%|██████████| 1858/1858 [06:49<00:00,  4.54it/s]


Training epoch 5/5: Training Loss: 1.0185


Validating epoch 4/5...: 100%|██████████| 462/462 [00:19<00:00, 23.70it/s]

Validating epoch 5/5: Validating Loss: 0.8901





In [22]:
torch.save(model.state_dict(), 'finetuned_t5.pth')

# Dialogue loop

In [30]:
def generate(
        model: T5ForConditionalGeneration,
        inputs,
        num_return_sequences=5,
):
    return model.generate(
        **inputs,
        temperature=0.9,
        do_sample=True,
        top_p=0.7,
        num_return_sequences=num_return_sequences,
        repetition_penalty=2.5,
        max_length=64,
    )

In [56]:
print(len(tokenizer.get_vocab()))

20200


In [57]:
model.state_dict()['shared.weight'].shape

torch.Size([20100, 512])

In [59]:
import time

from data_preparer import ANSWER_PREFIX

model.to('cpu')
model.eval()

user_input = input('Talk to bot<3')
history = ''
verbose_history = []

while user_input != 'q':
    user_input = user_prefix + user_input
    history += user_input
    inputs = tokenizer(ANSWER_PREFIX + history, return_tensors='pt')
    with torch.no_grad():
        hypotheses = generate(model, inputs)
        bot_utterances = []
        for h in hypotheses:
            bot_utterances.append(tokenizer.decode(h, skip_special_tokens=True))
        history += bot_prefix + bot_utterances[0]
        print(bot_utterances)
        verbose_history.append((user_input, bot_prefix + bot_utterances[0]))
    time.sleep(1)
    user_input = input('Talk to bot<3')

model.to(device)

print('history:', history)
print('verbose history:')
for user, bot in verbose_history:
    print('--' + user.removeprefix(user_prefix))
    print('--' + bot.removeprefix(bot_prefix))

['Два часа.', 'Да, сегодня 20 минут.', 'Да, сегодня днем.', 'Сегодня пятница.', 'Сегодня вторник.']
['Сегодня в отпуске.', 'Это так. Я не могу этого рассказать, но ведь все равно на самом деле мне нужно знать об этом', 'Я живу в Москве.', 'Это не так, что ты можешь мне помочь?', 'Я живу в центре города.']
['Это город Севастополь, но сейчас на море.', 'Хорошо, в том городе.', 'Да, я тоже недавно был дома. Я могу быть в отпуске с друзьями', 'Да, я недавно был в Москве. А ты знаешь ли мы на каком-нибудь городе?', 'Я слышал, что это так.']
['Да, это правда. А ты знаешь ли вы что-то новое?', 'Да, я в Москве. А ты знаешь ли вы?', 'Я знаю, что город Севастополь был заканчиваться на 11 часов.', 'Да, я думаю что это город сейчас в Питере.', 'Это действительно интересно. В России есть всегда много мест, чтобы в нем были проблемы со скоростью 7 минут!']
['Интересно, какой город?', 'Это неверно. Я тоже хотел бы сказать, что этот город действительно интересно', 'Да, я не думаю об этой истории.', 'М

# Considering BLEU score(?!)

## Usage example

In [2]:
from torchmetrics.text import BLEUScore

# A BLEU score of 0.4 or higher is generally considered good (answer from GPT)

preds = ['the cat is on the mat']
target = [['there is a cat on the mat', 'a cat is on the mat']]
bleu = BLEUScore()
bleu(preds, target)

tensor(0.7598)

In [6]:
print(df_val)

                                            source_text  \
0     ответь: Но как же все, что было раньше? Мы же ...   
1     ответь: Ой, было ужасно, я проспал все выходны...   
2        ответь: Да вечеринка была и я уснул на диване.   
3     ответь: Привет, я наконец-то вернулся после до...   
4     ответь: Я побывал во многих странах, в том чис...   
...                                                 ...   
3691                                    ответь: Привет!   
3692    ответь: Я хочу узнать факты о пандемии COVID-19   
3693                ответь: Вот это да, а факт номер 2?   
3694    ответь: В России насчитывается более 19000 озер   
3695  ответь: В России находится самая высокая гора ...   

                                            target_text  
0     Да, но это может быть лучше, чем продолжать жи...  
1                                Что же ты так проспал?  
2     Надеюсь, тебе не пришлось что-то важное пропус...  
3             О, привет! Какое путешествие ты проходил?  
4

In [25]:
model.load_state_dict(torch.load('finetuned_t5.pth'))
model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(20100, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(20100, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedGeluDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo):

In [73]:
import re

string = 'Е е е? еее.еее'
string_splitted = re.split('\.|\?', string)
print(string_splitted)

['Е е е', ' еее', 'еее']


# Evaluating BLEU on val dataset

In [77]:
import re
from torchmetrics.text import BLEUScore

bleu = BLEUScore(n_gram=1)
avg_bleu_score = 0.
cnt = 100
with torch.no_grad():
    for _, row in tqdm(df_val.head(cnt).iterrows(), total=cnt):
        source = row['source_text']
        target = row['target_text']

        inputs = tokenizer(source, return_tensors='pt')
        hypotheses = generate(model, inputs, num_return_sequences=1)
        pred = tokenizer.decode(hypotheses[0], skip_special_tokens=True)

        shrinked_target = re.split('\.|\?', target)[0]
        shrinked_pred = re.split('\.|\?', pred)[0]
        
        metric = bleu(shrinked_pred, shrinked_target)
        avg_bleu_score += metric.item()
print('avg bleu:', avg_bleu_score * 1. / cnt)

100%|██████████| 100/100 [00:11<00:00,  8.73it/s]

avg bleu: 0.13365166932344436





# Considering METEOR metric

## Usage example

In [80]:
import evaluate

meteor = evaluate.load('meteor')
predictions = ["It is a guide to action which ensures that the military always obeys the commands of the party"]
reference = ["It is a guide to action which ensures that the military always obeys the commands of the party"]
results = meteor.compute(predictions=predictions, references=reference)
print(round(results['meteor'], 2))

1.0


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/timoniche/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/timoniche/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/timoniche/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Evaluating METEOR on val dataset

In [88]:
avg_meteor_score = 0.
initial_meteor_score = 0.

cnt = 100
initial_model = T5ForConditionalGeneration.from_pretrained('cointegrated/rut5-small-chitchat2')
with torch.no_grad():
    for _, row in tqdm(df_val.head(cnt).iterrows(), total=cnt):
        source = row['source_text']
        target = row['target_text']

        inputs = tokenizer(source, return_tensors='pt')
        hypotheses = generate(model, inputs, num_return_sequences=1)
        not_fine_tuned_hypotheses = generate(initial_model, inputs, num_return_sequences=1)
        
        pred = tokenizer.decode(hypotheses[0], skip_special_tokens=True)
        not_fine_tuned_preds = tokenizer.decode(not_fine_tuned_hypotheses[0], skip_special_tokens=True)
        
        shrinked_target = re.split('\.|\?', target)[0]
        shrinked_pred = re.split('\.|\?', pred)[0]
        shinked_initial_preds = re.split('\.|\?', not_fine_tuned_preds)[0]
        
        metric = meteor.compute(predictions=[shrinked_pred], references=[shrinked_target])
        initial_metric = meteor.compute(predictions=[shinked_initial_preds], references=[shrinked_target])
        
        avg_meteor_score += metric['meteor']
        initial_meteor_score += initial_metric['meteor']
        
print('avg meteor:', avg_meteor_score * 1. / cnt)
print('avg initial meteor:', initial_meteor_score * 1. / cnt)

100%|██████████| 100/100 [00:17<00:00,  5.56it/s]

avg meteor: 0.14073682317052313
avg initial meteor: 0.03196982772392597



