In [1]:
import torch
import numpy as np
import pytorch_lightning as pl
from torch import nn
from torch import optim
from sklearn.model_selection import train_test_split
from transformers import (
    AutoConfig,
    EncoderDecoderConfig,
    EncoderDecoderModel,
    BertTokenizerFast,
)

from models.baseline import Seq2Seq, Encoder, Decoder
from data_utils.dataset import TranslationDataset
from data_utils.lang import read_langs, PAD
from pl_utils.pl_model import ModelWrapper
from pl_utils.pl_dataset import PlTranslationDataset

DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
TEST_SHARE = 0.2

torch.cuda.empty_cache()

%load_ext autoreload
%autoreload 2

In [2]:
decoder_path = "ai-forever/ruBert-base"
encoder_path = "bert-base-uncased"


In [3]:
tokenizer = BertTokenizerFast.from_pretrained(encoder_path)
dec_tokenizer = BertTokenizerFast.from_pretrained(decoder_path)

In [4]:
encoder_config = AutoConfig.from_pretrained(encoder_path)
decoder_config = AutoConfig.from_pretrained(decoder_path)

model_config = EncoderDecoderConfig.from_encoder_decoder_configs(encoder_config, decoder_config)
model = EncoderDecoderModel(model_config) # .to(DEVICE)

model.config.decoder_start_token_id = dec_tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# DATA

In [5]:
from datasets import Dataset
import pandas as pd

In [6]:
# with open("data.txt", 'r') as flines:
#     all_lines = np.array(flines.readlines())

_, _, pairs = read_langs("en", "ru", "data.txt", False)

test_size = int(TEST_SHARE * len(pairs))
train_size = len(pairs) - test_size

train_pairs, val_pairs = train_test_split(pairs, test_size=TEST_SHARE, random_state=42)
val_pairs, test_pairs = train_test_split(val_pairs, test_size=TEST_SHARE, random_state=42)


In [7]:
train_dataset = Dataset.from_pandas(pd.DataFrame(data=train_pairs))
val_dataset = Dataset.from_pandas(pd.DataFrame(data=val_pairs))
test_dataset = Dataset.from_pandas(pd.DataFrame(data=test_pairs))


In [8]:
source_lang = "en"
target_lang = "ru"

def preprocess_function(
        pairs: Dataset,
        source_lang: str = "en",
        target_lang: str = "ru",
        enc_tokenizer=tokenizer,
        dec_tokenizer=dec_tokenizer,
    ):
    inputs = pairs[source_lang]
    targets = pairs[target_lang]

    max_input_length = int(np.percentile([len(s) for s in inputs], 95))
    max_target_length = int(np.percentile([len(s) for s in targets], 95))
    max_length = max(max_input_length, max_target_length)

    model_inputs = enc_tokenizer(
        inputs,
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )
    labels = dec_tokenizer(
        text_target=targets,
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    model_inputs["attention_mask"] = labels["attention_mask"]

    return model_inputs

In [9]:
return_cols = ['input_ids', 'attention_mask', 'labels']

tok_train_dataset = train_dataset.map(preprocess_function, batched=True)
tok_train_dataset.set_format(type="torch", columns=return_cols)
tok_val_dataset = val_dataset.map(preprocess_function, batched=True)
tok_val_dataset.set_format(type="torch", columns=return_cols)
tok_test_dataset = test_dataset.map(preprocess_function, batched=True)
tok_test_dataset.set_format(type="torch", columns=return_cols)


Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

# TRAIN

In [10]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from nltk.translate.bleu_score import corpus_bleu
import evaluate

metric = evaluate.load("sacrebleu")
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    padding="max_length",
    label_pad_token_id=tokenizer.pad_token_id,
)

In [11]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = dec_tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, dec_tokenizer.pad_token_id)
    decoded_labels = dec_tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    
    try:
        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
        result = {"bleu_hf": result["bleu"]}
    except ZeroDivisionError:
        result = {"bleu_hf": 0.}

    result["bleu_nltk"] = corpus_bleu(decoded_labels, decoded_preds)

    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [12]:
batch_size = 4
model_name = "bert2bert"

args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-{source_lang}-to-{target_lang}",
    evaluation_strategy = "steps",
    eval_steps=2000,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=10,
    predict_with_generate=True,
    gradient_accumulation_steps=2,
    fp16=True,
)

In [13]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tok_train_dataset,
    eval_dataset=tok_val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [14]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss




OverflowError: out of range integral type conversion attempted