In [None]:
from pathlib import Path
from tqdm import tqdm
from transformers import FSMTForConditionalGeneration, FSMTTokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [None]:
def translate(input_text, num_beams=50):
    """Translate one string using the notebook-level ``tokenizer`` and ``model``.

    Args:
        input_text: The source text to translate (a single string).
        num_beams: Beam width forwarded to ``model.generate``. Defaults to
            50, the value this notebook previously hard-coded.

    Returns:
        The first generated sequence decoded to text, with special tokens
        stripped.
    """
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    # Move the inputs to whichever device the model is on instead of
    # hard-coding CUDA, so this also works on CPU or on a non-default GPU.
    input_ids = input_ids.to(model.device)
    outputs = model.generate(input_ids, num_beams=num_beams)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Option 1: the Helsinki-NLP opus-mt-ru-en checkpoint.
# Binds the notebook-level `tokenizer` and `model` that translate() reads.
# The facebook/wmt19-ru-en cell rebinds the same names, so whichever of the
# two loading cells ran last is the one used for translation.
mname = "Helsinki-NLP/opus-mt-ru-en"
tokenizer = AutoTokenizer.from_pretrained(mname)
model = AutoModelForSeq2SeqLM.from_pretrained(mname)
model = model.cuda()

In [None]:
# Option 2: the facebook/wmt19-ru-en checkpoint.
# Rebinds the notebook-level `tokenizer` and `model` that translate() reads,
# replacing whatever the Helsinki-NLP loading cell put there.
mname = "facebook/wmt19-ru-en"
tokenizer = FSMTTokenizer.from_pretrained(mname)
model = FSMTForConditionalGeneration.from_pretrained(mname)
model = model.cuda()

In [None]:
# Translate every line of the test set and save one translation per line.
# The files are read and written explicitly as UTF-8 so the non-ASCII text
# does not depend on the platform's default encoding.
test_file_path = Path('test.ru.txt')
input_sentenses = test_file_path.read_text(encoding='utf-8').splitlines()
output = [translate(sentence) for sentence in tqdm(input_sentenses)]
# Saved as 'answer_test.txt' so it does not overwrite 'answer.txt', which the
# eval cell writes.
Path('answer_test.txt').write_text('\n'.join(output), encoding='utf-8')

In [None]:
eval_file_path = Path('eval-ru-100.txt')
input_sentenses = eval_file_path.read_text().splitlines()
output = [translate(i) for i in tqdm(input_sentenses)]
Path('answer.txt').write_text('\n'.join(output))
!zip answer.zip answer.txt 

In [None]:
import gdown

url = 'https://drive.google.com/uc?export=download&id=1kM4FV2d7tBXmjWx0_Nc-klDZNRoKCRHY'
output = 'UNv1.0.en-ru.tar.gz.00'
gdown.download(url, output, quiet=False)
    
url = 'https://drive.google.com/uc?export=download&id=1T76T6SsB3PL0OUjfGxegbFZaLhz7rUjG'
output = 'UNv1.0.en-ru.tar.gz.01'
gdown.download(url, output, quiet=False)

url = 'https://drive.google.com/uc?export=download&id=17xvY_z-tGgqM-QiC9mwQus5WkxRYQRSI'
output = 'UNv1.0.en-ru.tar.gz.02'
gdown.download(url, output, quiet=False)

!cat UNv1.0.en-ru.tar.gz.* | tar -xzf -

In [None]:
!wget https://s3.amazonaws.com/web-language-models/paracrawl/bonus/en-ru.txt.gz

In [None]:
from pathlib import Path
from tqdm import tqdm
import json

In [None]:
# Build translation records from the tab-separated 'en-ru.txt' file: the
# first field of each line is stored under 'en', the second under 'ru'.
# Read explicitly as UTF-8 so the result does not depend on the platform's
# default encoding.
text = Path('en-ru.txt').read_text(encoding='utf-8')
# Split on '\n' only. str.splitlines() also breaks on characters such as
# U+2028, U+0085 or form feed, which can occur inside a sentence and would
# cut a record in two.
rows = (line.split('\t') for line in text.split('\n'))
# Lines with fewer than two fields (including the empty string after a final
# newline) are skipped; they would otherwise yield a record with no 'ru' key.
data_processed_1 = [
    {'translation': {'en': row[0], 'ru': row[1]}}
    for row in rows
    if len(row) >= 2
]

In [None]:
def _read_lines(path):
    """Read a UTF-8 text file and return its lines, split on newlines only.

    str.splitlines() is avoided on purpose: it also breaks on characters
    such as U+2028, U+0085 or form feed. If one of those appears inside a
    sentence on only one side of a parallel corpus, every following line
    pair would be shifted out of alignment.
    """
    lines = Path(path).read_text(encoding='utf-8').split('\n')
    # A file that ends with a newline yields one empty trailing element.
    if lines and lines[-1] == '':
        lines.pop()
    return lines


# Load the line-aligned UNv1.0 en-ru files.
en = _read_lines('en-ru/UNv1.0.en-ru.en')
ru = _read_lines('en-ru/UNv1.0.en-ru.ru')
# `ids` is not used in this cell; it is still loaded so the name stays available.
ids = _read_lines('en-ru/UNv1.0.en-ru.ids')

# zip() would silently truncate to the shorter side; a length mismatch means
# the two files are not aligned, so fail loudly instead.
assert len(en) == len(ru), f'misaligned corpus: {len(en)} en lines vs {len(ru)} ru lines'

# Pairs whose two sides are identical strings are dropped.
data_processed_2 = [{'translation': {'en': e, 'ru': r}} for r, e in zip(ru, en) if r != e]

In [None]:
# Combine both corpora into one new list: the 'en-ru.txt' records first,
# then the UNv1.0 records.
data_processed = [*data_processed_1, *data_processed_2]

In [None]:
# Create 'data_train.json' if it does not exist, or empty it if it does.
open('data_train.json', 'w').close()

In [None]:
# Write the training data as JSON Lines: one {"translation": {...}} object
# per line, with no enclosing brackets. This is the layout the Hugging Face
# translation example documents for custom train files. The previous output
# wrapped newline-separated objects in '[' ... ']' without commas, which is
# neither valid JSON nor JSON Lines.
#
# Opened with 'w' so re-running the cell rewrites the file instead of
# appending a second copy. UTF-8 is set explicitly because ensure_ascii=False
# emits the non-ASCII text unescaped.
with open('data_train.json', 'w', encoding='utf-8') as f:
    for line in tqdm(data_processed):
        f.write(json.dumps(line, ensure_ascii=False) + '\n')

In [None]:
CUDA_VISIBLE_DEVICES=1 python run_seq2seq.py \
    --model_name_or_path facebook/wmt19-ru-en  \
    --do_train \
    --task translation_ru_to_en \
    --dataset_name wmt19 \
    --dataset_config_name ru-en \
    --output_dir tst-translation \
    --train_file data_train.json \
    --per_device_train_batch_size=4 \
    --per_device_eval_batch_size=4 \
    --overwrite_output_dir \
    --predict_with_generate \
    --max_train_samples 5000

In [None]:
CUDA_VISIBLE_DEVICES=1 python run_seq2seq.py \
    --model_name_or_path facebook/wmt19-ru-en  \
    --do_train \
    --task translation_ru_to_en \
    --dataset_name wmt19 \
    --dataset_config_name ru-en \
    --source_lang ru_RU \
    --target_lang en_XX \
    --output_dir tst-translation \
    --train_file data_train.json \
    --per_device_train_batch_size=4 \
    --per_device_eval_batch_size=4 \
    --overwrite_output_dir \
    --predict_with_generate \
    --max_train_samples 5000 \
    --max_val_samples 500