In [None]:
# Colab-only step: mount Google Drive at /content/drive so that the dataset,
# model and output paths used in the following cells (all under
# /content/drive/MyDrive) can be resolved.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def read_file_lines_to_list(path):
    """Read a UTF-8 text file and return its lines as a list of strings.

    The whole file is read at once and split on the newline character, so a
    file that ends with a newline produces a final empty string, and an empty
    file produces a list holding one empty string.

    Args:
        path: Location of the text file to read.

    Returns:
        The pieces of the file content between newline characters.
    """
    with open(path, mode='r', encoding='utf-8') as handle:
        content = handle.read()
    return content.split('\n')

In [None]:
import os

# Parallel test set: Russian text and Tatar text.
path_ru_text = "/content/drive/MyDrive/datasets/translation_ru_tat_tests/test_ru.txt"
path_tat_text = "/content/drive/MyDrive/datasets/translation_ru_tat_tests/test_tat.txt"

# Yandex translations of the same test set, one file per direction.
path_yandex_ru_tat = "/content/drive/MyDrive/datasets/translation_ru_tat_tests/yandex_test_ru_tat.txt"
path_yandex_tat_ru = "/content/drive/MyDrive/datasets/translation_ru_tat_tests/yandex_test_tat_ru.txt"

# Read every file into a list of lines, in the same order as the paths above.
text_ru, text_tat = (
    read_file_lines_to_list(corpus_path)
    for corpus_path in (path_ru_text, path_tat_text)
)
translations_yandex_ru_tat, translations_yandex_tat_ru = (
    read_file_lines_to_list(corpus_path)
    for corpus_path in (path_yandex_ru_tat, path_yandex_tat_ru)
)

In [None]:
from transformers import AutoModelForSeq2SeqLM, NllbTokenizerFast

# Two checkpoints stored on Drive (one per translation direction) plus the
# public NLLB-200 distilled 600M checkpoint.
checkpoint_ru_tat = "/content/drive/MyDrive/models/model_ru_tat_2epochs"
checkpoint_tat_ru = "/content/drive/MyDrive/models/model_tat_ru_2epochs"
checkpoint = "facebook/nllb-200-distilled-600M"

# One seq2seq model per checkpoint.
model_ru_tat = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_ru_tat)
model_tat_ru = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_tat_ru)
model_base = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Both tokenizers are loaded from the public checkpoint; they differ only in
# the source/target language codes they are configured with.
tokenizer_ru_tat = NllbTokenizerFast.from_pretrained(
    checkpoint,
    src_lang='rus_Cyrl',
    tgt_lang='tat_Cyrl',
)
tokenizer_tat_ru = NllbTokenizerFast.from_pretrained(
    checkpoint,
    src_lang='tat_Cyrl',
    tgt_lang='rus_Cyrl',
)

In [None]:
# Move all three models to the GPU before the translation pipelines are built.
# The last call is the cell's final expression, so its return value is what the
# notebook displays for this cell.
model_ru_tat.cuda()
model_tat_ru.cuda()
model_base.cuda()

In [None]:
from transformers import pipeline
from tqdm import tqdm
import numpy as np

BATCH_SIZE = 16

# Keyword settings shared by every translation pipeline built below.
_pipeline_settings = {"device": 0, "num_beams": 8, "batch_size": BATCH_SIZE}

# Russian -> Tatar: public base model vs. the checkpoint loaded from Drive.
translator_base_ru_tat = pipeline(
    "translation",
    model=model_base,
    tokenizer=tokenizer_ru_tat,
    src_lang='rus_Cyrl',
    tgt_lang='tat_Cyrl',
    **_pipeline_settings,
)
translator_our_ru_tat = pipeline(
    "translation",
    model=model_ru_tat,
    tokenizer=tokenizer_ru_tat,
    src_lang='rus_Cyrl',
    tgt_lang='tat_Cyrl',
    **_pipeline_settings,
)

# Tatar -> Russian: public base model vs. the checkpoint loaded from Drive.
translator_base_tat_ru = pipeline(
    "translation",
    model=model_base,
    tokenizer=tokenizer_tat_ru,
    src_lang='tat_Cyrl',
    tgt_lang='rus_Cyrl',
    **_pipeline_settings,
)
translator_our_tat_ru = pipeline(
    "translation",
    model=model_tat_ru,
    tokenizer=tokenizer_tat_ru,
    src_lang='tat_Cyrl',
    tgt_lang='rus_Cyrl',
    **_pipeline_settings,
)

def create_batches(data, batch_size=BATCH_SIZE):
    """Split `data` into consecutive slices of at most `batch_size` items.

    Args:
        data: A sliceable sequence (its length and slices are used).
        batch_size: Number of items per slice; the last slice may be shorter.

    Returns:
        A list of slices of `data`, in the original order.
    """
    batches = []
    for start in range(0, len(data), batch_size):
        stop = start + batch_size
        batches.append(data[start:stop])
    return batches

def _extend_with_translations(translator, batches, desc, collected):
    """Translate every batch and append each result's 'translation_text' to `collected`.

    `collected` is extended in place, batch by batch, while a tqdm progress
    bar labelled `desc` tracks the batches.
    """
    for chunk in tqdm(batches, desc=desc):
        outputs = translator(chunk)
        collected.extend(item['translation_text'] for item in outputs)


text_ru_batches = create_batches(text_ru)
text_tat_batches = create_batches(text_tat)

# One output list per (model, direction) pair; each is filled in place below.
translations_base_ru_tat = []
translations_our_ru_tat = []
translations_base_tat_ru = []
translations_our_tat_ru = []

print(f"Total batches: {len(text_ru_batches)}")

_extend_with_translations(translator_base_ru_tat, text_ru_batches, "Base ru->tat", translations_base_ru_tat)
_extend_with_translations(translator_our_ru_tat, text_ru_batches, "Our ru->tat", translations_our_ru_tat)
_extend_with_translations(translator_base_tat_ru, text_tat_batches, "Base tat->ru", translations_base_tat_ru)
_extend_with_translations(translator_our_tat_ru, text_tat_batches, "Our tat->ru", translations_our_tat_ru)

In [None]:
# Destination files for the translations of the two Drive checkpoints.
path_translations_ru_tat = "/content/drive/MyDrive/utils/translations/translations_2ep_ru_tat.txt"
path_translations_tat_ru = "/content/drive/MyDrive/utils/translations/translations_2ep_tat_ru.txt"

# Destination files for the translations of the public base model.
path_translations_base_ru_tat = "/content/drive/MyDrive/utils/translations/translations_base_ru_tat.txt"
path_translations_base_tat_ru = "/content/drive/MyDrive/utils/translations/translations_base_tat_ru.txt"

def save_translations(translations, path):
    """Write translations to a text file, one per line, encoded as UTF-8.

    The items are joined with a newline and no trailing newline is added, so
    reading the file back and splitting it on newlines yields the same items
    (provided no item itself contains a newline).

    The encoding is set explicitly so the file does not depend on the
    platform's default locale and matches the UTF-8 decoding used by
    `read_file_lines_to_list`.

    Args:
        translations: Iterable of strings to save.
        path: Destination file; an existing file is overwritten.
    """
    translations_to_save = '\n'.join(translations)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(translations_to_save)

# Persist every translation list to its destination file, in a fixed order.
for _translations, _destination in (
    (translations_our_ru_tat, path_translations_ru_tat),
    (translations_our_tat_ru, path_translations_tat_ru),
    (translations_base_ru_tat, path_translations_base_ru_tat),
    (translations_base_tat_ru, path_translations_base_tat_ru),
):
    save_translations(_translations, _destination)

In [None]:
!pip install evaluate sacrebleu

# NOTE(review): `load_metric` is not referenced in any cell visible here (the
# metrics are loaded through `evaluate.load`). It is believed to have been
# removed from recent `datasets` releases, in which case this import raises
# ImportError on a fresh environment — TODO confirm and consider dropping it.
from datasets import load_metric
import evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m475.9 kB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m16.4 kB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: 

In [None]:
# Load the metric implementations through `evaluate`.
# NOTE(review): only `sacrebleu` and `chrf` are used by the cells visible here;
# `bleu` is loaded but not referenced afterwards.
bleu = evaluate.load("bleu")
sacrebleu = evaluate.load("sacrebleu")
chrf = evaluate.load("chrf")

In [None]:
# Saved translations of the public base model, one file per direction.
path_base_ru_tat = "/content/drive/MyDrive/utils/translations/translations_base_ru_tat.txt"
path_base_tat_ru = "/content/drive/MyDrive/utils/translations/translations_base_tat_ru.txt"

# Saved translations of the two Drive checkpoints, one file per direction.
path_our_ru_tat = "/content/drive/MyDrive/utils/translations/translations_2ep_ru_tat.txt"
path_our_tat_ru = "/content/drive/MyDrive/utils/translations/translations_2ep_tat_ru.txt"

# Re-read the translations from disk; this rebinds the in-memory lists of the
# same names to the file contents.
translations_base_ru_tat, translations_base_tat_ru = (
    read_file_lines_to_list(saved_path)
    for saved_path in (path_base_ru_tat, path_base_tat_ru)
)
translations_our_ru_tat, translations_our_tat_ru = (
    read_file_lines_to_list(saved_path)
    for saved_path in (path_our_ru_tat, path_our_tat_ru)
)

In [None]:
def print_BLEU_Chrf(model_name, lang_dir, predictions, references):
    """Print the chrF and sacreBLEU scores of one model for one direction.

    Both metrics are computed with the module-level `chrf` and `sacrebleu`
    metric objects, and each score is rounded to three decimals.

    Args:
        model_name: Label of the evaluated model, used in the printed line.
        lang_dir: Label of the translation direction, printed first.
        predictions: Translations produced by the model.
        references: Reference translations passed to the metrics.
    """
    # chrF is configured with character n-grams up to 6 and word n-grams up to 2.
    chrf_score = chrf.compute(
        predictions=predictions,
        references=references,
        word_order=2,
        char_order=6,
    )['score']

    # sacreBLEU with the "intl" tokenizer on lowercased text.
    bleu_score = sacrebleu.compute(
        predictions=predictions,
        references=references,
        tokenize="intl",
        lowercase=True,
    )['score']

    chrf_rounded = round(chrf_score, 3)
    bleu_rounded = round(bleu_score, 3)
    print(f"{lang_dir} {model_name}: chrf++ = {chrf_rounded}, sacreBLEU = {bleu_rounded}")


In [None]:
# Russian -> Tatar: score every system against the Tatar side of the test set.
for _label, _predictions in (
    ("Yandex_model", translations_yandex_ru_tat),
    ("Base NLLB-200/600M", translations_base_ru_tat),
    ("Our", translations_our_ru_tat),
):
    print_BLEU_Chrf(_label, "RU-TT", _predictions, text_tat)

print()

# Tatar -> Russian: score every system against the Russian side of the test set.
for _label, _predictions in (
    ("Yandex_model", translations_yandex_tat_ru),
    ("Base NLLB-200/600M", translations_base_tat_ru),
    ("Our", translations_our_tat_ru),
):
    print_BLEU_Chrf(_label, "TT-RU", _predictions, text_ru)


RU-TT Yandex_model: chrf++ = 35.212, sacreBLEU = 6.45
RU-TT Base NLLB-200/600M: chrf++ = 45.004, sacreBLEU = 15.303
RU-TT Our: chrf++ = 44.175, sacreBLEU = 16.652

TT-RU Yandex_model: chrf++ = 34.561, sacreBLEU = 7.375
TT-RU Base NLLB-200/600M: chrf++ = 45.571, sacreBLEU = 20.511
TT-RU Our: chrf++ = 43.355, sacreBLEU = 18.964
