In [2]:
import torch
import numpy as np
import pytorch_lightning as pl
from torch import nn
from torch import optim
from sklearn.model_selection import train_test_split
from transformers import (
    AutoConfig,
    EncoderDecoderConfig,
    EncoderDecoderModel,
    BertTokenizerFast,
)

from models.baseline import Seq2Seq, Encoder, Decoder
from data_utils.dataset import TranslationDataset
from data_utils.lang import read_langs, PAD
from pl_utils.pl_model import ModelWrapper
from pl_utils.pl_dataset import PlTranslationDataset

DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
TEST_SHARE = 0.2

torch.cuda.empty_cache()

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
decoder_path = "ai-forever/ruBert-base"
encoder_path = "bert-base-uncased"


In [4]:
tokenizer = BertTokenizerFast.from_pretrained(encoder_path)
dec_tokenizer = BertTokenizerFast.from_pretrained(decoder_path)

In [4]:
encoder_config = AutoConfig.from_pretrained(encoder_path)
decoder_config = AutoConfig.from_pretrained(decoder_path)

model_config = EncoderDecoderConfig.from_encoder_decoder_configs(encoder_config, decoder_config)
model = EncoderDecoderModel(model_config) # .to(DEVICE)

model.config.decoder_start_token_id = dec_tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# DATA

In [6]:
from datasets import Dataset
import pandas as pd

In [7]:
# with open("data.txt", 'r') as flines:
#     all_lines = np.array(flines.readlines())

_, _, pairs = read_langs("en", "ru", "data.txt", False)

test_size = int(TEST_SHARE * len(pairs))
train_size = len(pairs) - test_size

train_pairs, val_pairs = train_test_split(pairs, test_size=TEST_SHARE, random_state=42)
val_pairs, test_pairs = train_test_split(val_pairs, test_size=TEST_SHARE, random_state=42)


In [8]:
train_dataset = Dataset.from_pandas(pd.DataFrame(data=train_pairs))
val_dataset = Dataset.from_pandas(pd.DataFrame(data=val_pairs))
test_dataset = Dataset.from_pandas(pd.DataFrame(data=test_pairs))


In [9]:
source_lang = "en"
target_lang = "ru"

def preprocess_function(
        pairs: Dataset,
        source_lang: str = "en",
        target_lang: str = "ru",
        enc_tokenizer=tokenizer,
        dec_tokenizer=dec_tokenizer,
    ):
    inputs = pairs[source_lang]
    targets = pairs[target_lang]

    max_input_length = int(np.percentile([len(s) for s in inputs], 95))
    max_target_length = int(np.percentile([len(s) for s in targets], 95))
    max_length = max(max_input_length, max_target_length)

    model_inputs = enc_tokenizer(
        inputs,
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )
    labels = dec_tokenizer(
        text_target=targets,
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    model_inputs["attention_mask"] = labels["attention_mask"]

    return model_inputs

In [14]:
return_cols = ['input_ids', 'attention_mask', 'labels']

tok_train_dataset = train_dataset.map(preprocess_function, batched=True)
tok_train_dataset.set_format(columns=return_cols)
tok_val_dataset = val_dataset.map(preprocess_function, batched=True)
tok_val_dataset.set_format(columns=return_cols)
tok_test_dataset = test_dataset.map(preprocess_function, batched=True)
tok_test_dataset.set_format(type="torch", columns=return_cols)


Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

# TRAIN

In [27]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from nltk.translate.bleu_score import corpus_bleu
import evaluate

metric = evaluate.load("sacrebleu")
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    padding=True,
)

In [11]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = dec_tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, dec_tokenizer.pad_token_id)
    decoded_labels = dec_tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    
    try:
        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
        result = {"bleu_hf": result["score"]}
    except ZeroDivisionError:
        result = {"bleu_hf": 0.}

    result["bleu_nltk"] = corpus_bleu(decoded_labels, decoded_preds)

    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [15]:
batch_size = 4
model_name = "bert2bert"

args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-{source_lang}-to-{target_lang}",
    evaluation_strategy = "steps",
    eval_steps=1500,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    predict_with_generate=True,
    gradient_accumulation_steps=2,
    fp16=True,
)

In [16]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tok_train_dataset,
    eval_dataset=tok_val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [17]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Bleu Hf,Bleu Nltk
1500,0.5493,0.52195,4.9741,0.1891
3000,0.4638,0.450316,6.398,0.2531
4500,0.4291,0.416468,7.0095,0.2651
6000,0.3956,0.393089,7.7815,0.2846
7500,0.3853,0.378003,8.6451,0.3073
9000,0.366,0.367079,9.45,0.3256
10500,0.3478,0.357557,10.0856,0.33
12000,0.346,0.351553,10.4758,0.338
13500,0.3395,0.347449,10.706,0.3396
15000,0.3368,0.345117,10.9664,0.3404








TrainOutput(global_step=15000, training_loss=0.5782082529703776, metrics={'train_runtime': 6384.5936, 'train_samples_per_second': 18.795, 'train_steps_per_second': 2.349, 'total_flos': 2.689997540270904e+16, 'train_loss': 0.5782082529703776, 'epoch': 3.0})

In [5]:
checkpoint = "bert2bert-finetuned-en-to-ru/checkpoint-15000"
model = EncoderDecoderModel.from_pretrained(checkpoint)


In [73]:
from tqdm.notebook import tqdm

def best_worst(model, dataset):
    best_ex = []
    worst_ex = []
    
    for i, target in tqdm(enumerate(dataset), total=len(dataset)):
        if i == 100:
            break
        kwargs = dataset[i:i+1]
        trans = model.generate(**kwargs).squeeze()

        translated = dec_tokenizer.decode(trans, skip_special_tokens=True)
        target_sent = dec_tokenizer.decode(target['labels'], skip_special_tokens=True)
        en_sent = tokenizer.decode(target["input_ids"], skip_special_tokens=True)
        score = corpus_bleu([[translated]], [target_sent])
        
        # считаем по бейзоайну
        if score > 0.3:
            best_ex.append([en_sent, target_sent, translated, score])
        
        if score < 0.1:
            worst_ex.append([en_sent, target_sent, translated, score])
    
    return best_ex, worst_ex
            

In [74]:
results = best_worst(model, tok_test_dataset)

  0%|          | 0/2000 [00:00<?, ?it/s]

In [75]:
results[0][:5]

[['featuring a hairdryer, the private bathroom also comes with towels and free toiletries.',
  'также в распоряжении гостеи собственная ванная комната с феном, полотенцами и бесплатными туалетно - косметическими принадлежностями.',
  'в собственнои ваннои комнате с душем предоставляются бесплатные туалетно - косметические принадлежности и бесплатные',
  0.5589788093099456],
 ['all rooms come with clean bedding.',
  'все номера укомплектованы свежим постельным бельем.',
  'все номера оснащены телевизором с кабельными каналами.',
  0.40769919888479106],
 ['it features free wi - fi and a furnished terrace with swimming pool.',
  'к услугам гостеи бесплатныи wifi и меблированная терраса с бассеином.',
  'к услугам гостеи открытыи бассеин, терраса и бесплатныи wi -',
  0.6572218598665456],
 ['there is a dining area and a kitchen.',
  'в числе удобств — обеденная зона и кухня.',
  'в числе удобств — обеденная зона и кухня.',
  1.0],
 ['a full breakfast buffet is served every morning at the c

In [76]:
results[1][:5]

[['the hotel is conveniently situated to reach paris bercy stadium ( popb ), vincennes castle and zoo, horse tracks and disneyland resort paris.',
  'отель удобно расположен для достижения стадиона paris bercy ( popb ), замка винсенс и зоопарка, конных треков и парижского диснеиленда.',
  'отель находится в городе сан - де - де - де - де - де - ла -',
  0.07602253426229971],
 ['the famously narrow punkaharju ridge is 45 minutes ’ drive from pajarinhovi, while the russian border at niirala is a 1 - hour journey.',
  'известныи узкии хребет пункахарью расположен в 45 минутах езды от отеля pajarinhovi, а контрольно - пропускнои пункт ниирала на границе с россиеи - в 1 часе езды.',
  'отель типа « постель и завтрак » la la la la la las находится в 1,',
  0.062415559546089294],
 ['guest house zolotaya rybka is located in olginka, 48 km from lazarevskoye and 18 km from tuapse.',
  'гостевои дом « золотая рыбка » находится в селе ольгинка, в 48 км от микрораиона лазаревское и в 18 км от туапс