In [4]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
import os
import sys

root_dir = os.path.abspath(os.path.join(os.getcwd(), '../..'))
print(root_dir)
assert os.path.exists(root_dir), f'Could not find root directory at {root_dir}'
sys.path.insert(0, root_dir)

from custom_utils.config_handler import read_config, pprint_config

/Users/user010/Desktop/Programming/ML/En2RuTranslator


In [6]:
overrides = ["setup=inference"]
cfg = read_config(overrides=overrides)
pprint_config(cfg)

{
  "root": "/Users/user010/Desktop/Programming/ML/En2RuTranslator",
  "opus_model": {
    "name": "opus-en-ru",
    "model_and_tokenizer_name": "Helsinki-NLP/opus-mt-en-ru"
  },
  "nnlb_model": {
    "name": "nnlb-1.3B-distilled",
    "model_and_tokenizer_name": "facebook/nllb-200-distilled-1.3B"
  },
  "inference_dataset": {
    "path": "/Users/user010/Desktop/Programming/ML/En2RuTranslator/data/processed/model_eval.csv"
  },
  "result_dataset": {
    "path": "/Users/user010/Desktop/Programming/ML/En2RuTranslator/data/processed/model_eval_results.csv",
    "cols": {
      "reference": "target",
      "candidates": [
        "transformer-en-ru",
        "nnlb-1.3B-distilled"
      ]
    }
  }
}


In [17]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
sample_texts = ["Hey, how are you?", "My name is John Smith, I live in the United States of America.",
                "I love NLP and Transformers!"]

def get_translations(model: AutoModelForSeq2SeqLM, tokenizer: AutoTokenizer, sample_texts: list,
                     special_gen_params: dict = None) -> list:
    special_gen_params = special_gen_params or {}
    print("Tokenizing...")
    inputs = tokenizer(sample_texts, return_tensors="pt", padding=True, truncation=True, max_length=600)
    print("Generating...")
    translated_tokens = model.generate(
            **inputs,
            **special_gen_params,
            max_length=600,
            early_stopping=True
        )
    print("Decoding...")
    translated_texts = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
    return translated_texts

def print_translations(source: list[str], target: list[str]):
    assert len(source) == len(target), "Source and target lists must be of same length"
    for src, tgt in zip(source, target):
        print(f"Source: {src}")
        print(f"Target: {tgt}")
        print()

In [19]:
model_and_tokenizer_name = cfg.nnlb_model.model_and_tokenizer_name
nnlb_tokenizer = AutoTokenizer.from_pretrained(model_and_tokenizer_name)
nnlb_model = AutoModelForSeq2SeqLM.from_pretrained(model_and_tokenizer_name)

In [20]:
translations = get_translations(nnlb_model, nnlb_tokenizer, sample_texts, 
                                special_gen_params={"forced_bos_token_id": nnlb_tokenizer.lang_code_to_id["rus_Cyrl"]})
print_translations(sample_texts, translations)

Tokenizing...
Generating...




Decoding...
Source: Hey, how are you?
Target: Привет, как дела?

Source: My name is John Smith, I live in the United States of America.
Target: Меня зовут Джон Смит, я живу в Соединенных Штатах Америки.

Source: I love NLP and Transformers!
Target: Я люблю НЛП и Трансформеров!



In [21]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_and_tokenizer_name = cfg.opus_model.model_and_tokenizer_name
opus_tokenizer = AutoTokenizer.from_pretrained(model_and_tokenizer_name)
opus_model = AutoModelForSeq2SeqLM.from_pretrained(model_and_tokenizer_name)



In [22]:
translations = get_translations(opus_model, opus_tokenizer, sample_texts)
print_translations(sample_texts, translations)

Tokenizing...
Generating...
Decoding...
Source: Hey, how are you?
Target: Привет, как дела?

Source: My name is John Smith, I live in the United States of America.
Target: Меня зовут Джон Смит, я живу в Соединенных Штатах Америки.

Source: I love NLP and Transformers!
Target: Я люблю NLP и Transformers!



In [23]:
import pandas as pd
inference_ds = pd.read_csv(cfg.inference_dataset.path)
inference_ds.head()

Unnamed: 0,source,target
0,The intention would also be to infiltrate terr...,Террористов также обучают методам проникновени...
1,Officials say that as the latest information a...,"По последним данным представителей власти , в ..."
2,While the Balakot camp was reactivated by the ...,Джаиш-е-Мухаммад возобновили работу террористи...
3,The incident in which Pakistan used drones to ...,В качестве яркого примера новой стратегии паки...
4,Officials tell OneIndia that terror groups wou...,Представители власти рассказали порталу OneInd...


In [24]:
from tqdm import tqdm

batch_size = 32
num_batches = len(inference_ds) // batch_size + 1

In [26]:
# nnlb
translations = []
for i in tqdm(range(num_batches)):
    batch = inference_ds.iloc[i*batch_size:(i+1)*batch_size]["source"].tolist()
    batch_translations = get_translations(nnlb_model, nnlb_tokenizer, batch,
                                          special_gen_params={"forced_bos_token_id": nnlb_tokenizer.lang_code_to_id["rus_Cyrl"]})
    translations.extend(batch_translations)



Tokenizing...
Generating...


  3%|▎         | 1/32 [00:33<17:07, 33.14s/it]

Decoding...
Tokenizing...
Generating...


  6%|▋         | 2/32 [01:01<15:06, 30.21s/it]

Decoding...
Tokenizing...
Generating...


  9%|▉         | 3/32 [01:35<15:33, 32.17s/it]

Decoding...
Tokenizing...
Generating...


 12%|█▎        | 4/32 [02:11<15:43, 33.68s/it]

Decoding...
Tokenizing...
Generating...


 16%|█▌        | 5/32 [02:45<15:10, 33.71s/it]

Decoding...
Tokenizing...
Generating...


 19%|█▉        | 6/32 [03:19<14:40, 33.87s/it]

Decoding...
Tokenizing...
Generating...


 22%|██▏       | 7/32 [03:55<14:20, 34.41s/it]

Decoding...
Tokenizing...
Generating...


 25%|██▌       | 8/32 [04:21<12:40, 31.68s/it]

Decoding...
Tokenizing...
Generating...


 28%|██▊       | 9/32 [04:45<11:16, 29.42s/it]

Decoding...
Tokenizing...
Generating...


 31%|███▏      | 10/32 [05:16<10:57, 29.91s/it]

Decoding...
Tokenizing...
Generating...


 34%|███▍      | 11/32 [05:38<09:39, 27.60s/it]

Decoding...
Tokenizing...
Generating...


 38%|███▊      | 12/32 [06:05<09:08, 27.42s/it]

Decoding...
Tokenizing...
Generating...


 41%|████      | 13/32 [06:37<09:05, 28.72s/it]

Decoding...
Tokenizing...
Generating...


 44%|████▍     | 14/32 [07:33<11:07, 37.06s/it]

Decoding...
Tokenizing...
Generating...


 47%|████▋     | 15/32 [07:59<09:30, 33.58s/it]

Decoding...
Tokenizing...
Generating...


 50%|█████     | 16/32 [08:40<09:32, 35.78s/it]

Decoding...
Tokenizing...
Generating...


 53%|█████▎    | 17/32 [09:18<09:08, 36.58s/it]

Decoding...
Tokenizing...
Generating...


 56%|█████▋    | 18/32 [09:54<08:28, 36.35s/it]

Decoding...
Tokenizing...
Generating...


 59%|█████▉    | 19/32 [10:20<07:09, 33.07s/it]

Decoding...
Tokenizing...
Generating...


 62%|██████▎   | 20/32 [10:41<05:55, 29.63s/it]

Decoding...
Tokenizing...
Generating...


 66%|██████▌   | 21/32 [11:07<05:12, 28.43s/it]

Decoding...
Tokenizing...
Generating...


 69%|██████▉   | 22/32 [11:41<05:02, 30.23s/it]

Decoding...
Tokenizing...
Generating...


 72%|███████▏  | 23/32 [12:26<05:12, 34.67s/it]

Decoding...
Tokenizing...
Generating...


 75%|███████▌  | 24/32 [13:55<06:47, 50.96s/it]

Decoding...
Tokenizing...
Generating...


 78%|███████▊  | 25/32 [14:24<05:10, 44.41s/it]

Decoding...
Tokenizing...
Generating...


 81%|████████▏ | 26/32 [14:59<04:09, 41.57s/it]

Decoding...
Tokenizing...
Generating...


 84%|████████▍ | 27/32 [15:38<03:23, 40.65s/it]

Decoding...
Tokenizing...
Generating...


 88%|████████▊ | 28/32 [16:10<02:32, 38.10s/it]

Decoding...
Tokenizing...
Generating...


 91%|█████████ | 29/32 [16:48<01:54, 38.08s/it]

Decoding...
Tokenizing...
Generating...


 94%|█████████▍| 30/32 [17:16<01:09, 34.99s/it]

Decoding...
Tokenizing...
Generating...


 97%|█████████▋| 31/32 [17:52<00:35, 35.49s/it]

Decoding...
Tokenizing...
Generating...


100%|██████████| 32/32 [18:12<00:00, 34.14s/it]

Decoding...





In [35]:
inference_ds[cfg.nnlb_model.name] = translations

In [28]:
# opus
translations = []
for i in tqdm(range(num_batches)):
    batch = inference_ds.iloc[i*batch_size:(i+1)*batch_size]["source"].tolist()
    batch_translations = get_translations(opus_model, opus_tokenizer, batch)
    translations.extend(batch_translations)

  0%|          | 0/32 [00:00<?, ?it/s]

Tokenizing...
Generating...


  3%|▎         | 1/32 [00:11<05:50, 11.30s/it]

Decoding...
Tokenizing...
Generating...


  6%|▋         | 2/32 [00:22<05:35, 11.17s/it]

Decoding...
Tokenizing...
Generating...


  9%|▉         | 3/32 [00:33<05:19, 11.03s/it]

Decoding...
Tokenizing...
Generating...


 12%|█▎        | 4/32 [00:46<05:28, 11.75s/it]

Decoding...
Tokenizing...
Generating...


 16%|█▌        | 5/32 [00:57<05:09, 11.48s/it]

Decoding...
Tokenizing...
Generating...


 19%|█▉        | 6/32 [01:07<04:46, 11.01s/it]

Decoding...
Tokenizing...
Generating...


 22%|██▏       | 7/32 [01:17<04:31, 10.88s/it]

Decoding...
Tokenizing...
Generating...


 25%|██▌       | 8/32 [01:26<04:02, 10.11s/it]

Decoding...
Tokenizing...
Generating...


 28%|██▊       | 9/32 [01:34<03:41,  9.65s/it]

Decoding...
Tokenizing...
Generating...


 31%|███▏      | 10/32 [01:44<03:33,  9.69s/it]

Decoding...
Tokenizing...
Generating...


 34%|███▍      | 11/32 [01:53<03:15,  9.30s/it]

Decoding...
Tokenizing...
Generating...


 38%|███▊      | 12/32 [02:01<03:03,  9.17s/it]

Decoding...
Tokenizing...
Generating...


 41%|████      | 13/32 [02:08<02:41,  8.52s/it]

Decoding...
Tokenizing...
Generating...


 44%|████▍     | 14/32 [02:22<03:02, 10.12s/it]

Decoding...
Tokenizing...
Generating...


 47%|████▋     | 15/32 [02:39<03:25, 12.06s/it]

Decoding...
Tokenizing...
Generating...


 50%|█████     | 16/32 [03:00<03:54, 14.66s/it]

Decoding...
Tokenizing...
Generating...


 53%|█████▎    | 17/32 [03:13<03:35, 14.34s/it]

Decoding...
Tokenizing...
Generating...


 56%|█████▋    | 18/32 [03:21<02:55, 12.53s/it]

Decoding...
Tokenizing...
Generating...


 59%|█████▉    | 19/32 [03:31<02:31, 11.63s/it]

Decoding...
Tokenizing...
Generating...


 62%|██████▎   | 20/32 [03:39<02:08, 10.67s/it]

Decoding...
Tokenizing...
Generating...


 66%|██████▌   | 21/32 [03:52<02:02, 11.09s/it]

Decoding...
Tokenizing...
Generating...


 69%|██████▉   | 22/32 [04:01<01:46, 10.63s/it]

Decoding...
Tokenizing...
Generating...


 72%|███████▏  | 23/32 [04:13<01:40, 11.12s/it]

Decoding...
Tokenizing...
Generating...


 75%|███████▌  | 24/32 [04:51<02:31, 18.95s/it]

Decoding...
Tokenizing...
Generating...


 78%|███████▊  | 25/32 [04:59<01:50, 15.82s/it]

Decoding...
Tokenizing...
Generating...


 81%|████████▏ | 26/32 [05:11<01:27, 14.60s/it]

Decoding...
Tokenizing...
Generating...


 84%|████████▍ | 27/32 [05:22<01:07, 13.55s/it]

Decoding...
Tokenizing...
Generating...


 88%|████████▊ | 28/32 [05:43<01:03, 15.95s/it]

Decoding...
Tokenizing...
Generating...


 91%|█████████ | 29/32 [05:56<00:45, 15.04s/it]

Decoding...
Tokenizing...
Generating...


 94%|█████████▍| 30/32 [06:06<00:27, 13.53s/it]

Decoding...
Tokenizing...
Generating...


 97%|█████████▋| 31/32 [06:21<00:13, 13.85s/it]

Decoding...
Tokenizing...
Generating...


100%|██████████| 32/32 [06:23<00:00, 11.99s/it]

Decoding...





In [29]:
inference_ds[cfg.opus_model.name] = translations

In [36]:
inference_ds.head()

Unnamed: 0,source,target,transformer-en-ru,nnlb-1.3B-distilled
0,The intention would also be to infiltrate terr...,Террористов также обучают методам проникновени...,"Кроме того, намерение состоит в том, чтобы про...",Имеется в виду также проникновение террористов...
1,Officials say that as the latest information a...,"По последним данным представителей власти , в ...","Официальные лица говорят, что в качестве самой...","Официальные лица говорят, что по последней инф..."
2,While the Balakot camp was reactivated by the ...,Джаиш-е-Мухаммад возобновили работу террористи...,В то время как лагерь в Балакоте был восстанов...,В то время как лагерь Балакота был активирован...
3,The incident in which Pakistan used drones to ...,В качестве яркого примера новой стратегии паки...,"Инцидент, в ходе которого Пакистан использовал...","Инцидент, когда Пакистан использовал беспилотн..."
4,Officials tell OneIndia that terror groups wou...,Представители власти рассказали порталу OneInd...,"Официальные лица сообщают одной Индии, что тер...","Официальные лица говорят OneIndia, что террори..."


In [37]:
inference_ds.to_csv(cfg.result_dataset.path, index=False)