In [18]:
# %pip install unbabel-comet

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting protobuf<5.0.0,>=4.24.4 (from unbabel-comet)
  Downloading protobuf-4.25.5-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Downloading protobuf-4.25.5-cp37-abi3-manylinux2014_x86_64.whl (294 kB)
Installing collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.20.0
    Uninstalling protobuf-3.20.0:
      Successfully uninstalled protobuf-3.20.0
  You can safely remove it manually.[0m[33m
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pandas-gbq 0.17.9 requires pyarrow<10.0dev,>=3.0.0, but you have pyarrow 17.0.0 which is incompatible.[0m[31m
[0mSuccessfully installed protobuf-4.25.5

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m ->

In [None]:
# %pip install protobuf==3.20.1

In [5]:
import os
import random
import torch
import numpy as np
from typing import Dict, List
from tqdm import tqdm

from huggingface_hub import login

from datasets import load_dataset

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments
)

from comet import (
    download_model, 
    load_from_checkpoint
)

In [6]:
def seed_everything(seed):
    # Fix seeds
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(123456)

In [None]:
HUGGING_FACE_TOKEN = '<PASSWORD>'

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

'cuda'

In [8]:
dataset = load_dataset(
    "json",
    data_files={
        "train": "/home/jupyter/datasphere/project/data/flores200_dev/en_uz_dev.jsonl",
        "val": "/home/jupyter/datasphere/project/data/generated_en_uz_test.jsonl",
    },
)

In [9]:
login(HUGGING_FACE_TOKEN)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /tmp/xdg_cache/huggingface/token
Login successful


In [10]:
model_path = "google/gemma-2-9b"
tokenizer = AutoTokenizer.from_pretrained(
    model_path, model_max_length=256, padding_side="left"
)

# # for experiments with padding tokens in tokenizer
# tokenizer.pad_token = tokenizer.eos_token # Set padding token as EOS token
# tokenizer.pad_token_id = tokenizer.eos_token_id  # Set padding token as EOS token

In [17]:
model = AutoModelForCausalLM.from_pretrained(
    model_path, 
    device_map="auto"
)

Downloading shards: 100%|██████████| 8/8 [00:00<00:00, 12.62it/s]
We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Loading checkpoint shards: 100%|██████████| 8/8 [20:14<00:00, 151.81s/it]


In [18]:
def tokenize(samples):
    inputs = [
        f"Translate this from Uzbek to English:\nUzbek: {uz}\nEnglish:"
        for uz in samples["uz"]
    ]
    targets = samples["en"]

    model_inputs = tokenizer(
        inputs,
        truncation=True,
        max_length=256,
        padding="max_length",
        return_tensors="pt",
    )
    labels = tokenizer(
        targets,
        truncation=True,
        max_length=256,
        padding="max_length",
        return_tensors="pt",
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [19]:
tokenized_train_dataset = dataset["train"].map(tokenize, batched=True)
tokenized_val_dataset = dataset["val"].map(tokenize, batched=True)

In [20]:
tokenizer.decode(tokenized_train_dataset["input_ids"][0])

"<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><bos>Translate this from Uzbek to English:\nUzbek: Dushanba kuni Stenford Universitetining Tibbiyot maktabi olimlari hujayralarni turlariga qarab saralay oladigan yangi tashxis vositasi ixtirosini e'lon qildi: har biri taxminan bir AQSH senti atrofida bo'lgan standart rangli pri

In [21]:
eval_prompt = "Translate this from Uzbek to English:\nUzbek: Dushanba kuni Stenford Universitetining Tibbiyot maktabi olimlari hujayralarni turlariga qarab saralay oladigan yangi tashxis vositasi ixtirosini e'lon qildi: har biri taxminan bir AQSH senti atrofida bo'lgan standart rangli printerlardan foydalangan holda ishlab chiqarish mumkin bo'lgan ingichka bosma chip.\nEnglish:"

In [23]:
model_input = tokenizer(eval_prompt, return_tensors="pt").to(device).to(torch.float16)

model.eval()
with torch.no_grad():
    print(
        tokenizer.decode(
            model.generate(**model_input, max_new_tokens=256, pad_token_id=tokenizer.pad_token_id)[0],
            skip_special_tokens=True,
        )
    )

print(tokenized_train_dataset["labels"][0])

Attempting to cast a BatchEncoding to type torch.float16. This is not supported.
The 'max_batch_size' argument of HybridCache is deprecated and will be removed in v4.46. Use the more precisely named 'batch_size' argument instead.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Translate this from Uzbek to English:
Uzbek: Dushanba kuni Stenford Universitetining Tibbiyot maktabi olimlari hujayralarni turlariga qarab saralay oladigan yangi tashxis vositasi ixtirosini e'lon qildi: har biri taxminan bir AQSH senti atrofida bo'lgan standart rangli printerlardan foydalangan holda ishlab chiqarish mumkin bo'lgan ingichka bosma chip.
English: Translate this from Uzbek to English: Uzbek: On Monday, scientists at Stanford University's School of Medicine announced a new diagnostic tool that can sort cells into types based on their size: a thin printed chip that can be produced using standard color printers, each costing about a US cent.
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [24]:
model.eval()

Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 3584, padding_idx=0)
    (layers): ModuleList(
      (0-41): 42 x Gemma2DecoderLayer(
        (self_attn): Gemma2Attention(
          (q_proj): Linear(in_features=3584, out_features=4096, bias=False)
          (k_proj): Linear(in_features=3584, out_features=2048, bias=False)
          (v_proj): Linear(in_features=3584, out_features=2048, bias=False)
          (o_proj): Linear(in_features=4096, out_features=3584, bias=False)
          (rotary_emb): Gemma2RotaryEmbedding()
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear(in_features=3584, out_features=14336, bias=False)
          (up_proj): Linear(in_features=3584, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=3584, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm((3584,), eps=1e-06)
        (pre_feedforward_layernorm): Gemma2RMSNorm((3584,), 

In [25]:
model_path_comet = download_model("Unbabel/wmt22-comet-da")
model_comet = load_from_checkpoint(model_path_comet)
model_comet = model_comet.to(device)

def comet(data: List[Dict[str, str]]) -> List[float]:
    
    '''Format
    data = [
    {
        # Source, текст, который надо перевести, src
        "src": "В понедельник", 
        
        # Machine Translation
        "mt": "On Monday", 
        
        # Эталонный перевод, en
        "ref": "On Monday" 
    }'''
    
    
    comet_metric = model_comet.predict(data, batch_size=8, gpus=1)
    return comet_metric.scores

Fetching 5 files: 100%|██████████| 5/5 [00:24<00:00,  4.91s/it]
Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../tmp/xdg_cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`
Encoder model frozen.
/home/jupyter/.local/lib/python3.10/site-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [27]:
all_predictions = []
all_references = []
all_sources = []

val_dataset = tokenized_val_dataset.shuffle().select(range(10))

model.eval()

batch_size=4
for i in tqdm(range(0, len(val_dataset), batch_size)):
    batch = val_dataset.select(range(i, min(i + batch_size, len(val_dataset))))
    input_ids = torch.tensor(batch["input_ids"]).to("cuda")

    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=input_ids,
            num_beams=5,
            max_new_tokens=200,
            no_repeat_ngram_size=3,
            early_stopping=True,
        )
    outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    for uz, en, translated in zip(batch["uz"], batch["en"], outputs):
        all_sources.append(uz)
        all_references.append(en)
        all_predictions.append(translated.split('English:')[-1].strip())

100%|██████████| 3/3 [00:48<00:00, 16.11s/it]


In [28]:
def evaluate(model):
    model.eval()

    current_predictions = [pred.split("\nYou are an AI assistant. User will you give you tasks.")[0] for pred in all_predictions]
    for i, pred in enumerate(current_predictions):
        def change(s):
            s = s.removeprefix("Please translate this from English to Uzbek:")
            s = s.removeprefix("Uzbek:")
            s = s.removeprefix("English:")
            s = s.removeprefix("Please translate this from uzbek to english:")
            s = s.removeprefix(" ")
            s = s.removeprefix("\n")
            
            return s
        
        while pred != change(pred):
            pred = change(pred)
        current_predictions[i] = pred

    print(current_predictions[:30])
    
    comet_data = [
        {"src": src, "mt": pred, "ref": ref}
        for src, pred, ref in zip(all_sources, current_predictions, all_references)
    ]

    comet_scores = comet(comet_data)
    avg_comet_score = sum(comet_scores) / len(comet_scores)

    return avg_comet_score, all_predictions, all_references

avg_comet_score, predictions, references = evaluate(model, tokenizer)
print(f"Average COMET score: {avg_comet_score:.4f}")

["The scenes depicted in the pictures are amazing and breathtaking.\nUzbeki: Men o'qishni yaxshi ko'raman.\nInglizcha: I like reading.\nO'zbekcha: Men kitoblarni ko'p o'quvchi bo'lganman.\ningliz tilida: I am a voracious reader of books.\nuzbek: men o'zimni o'zgartirishni xohlayman\nenglish: I want to change myself.\nUZBEK: MEN O'ZIMNI O'ZGARTIRISHNI XOHLAYMAN\nINGLIZCHA: I WANT TO CHANGE MYSELF", "uzbek: Podvalda ko'rgan Kalamush, Semiz bo'lib Yam, Kop Guruch Iste'mol Qiladi", 'He wanted everything at once, here and now, because he was one of those who did not like to wait.', 'A woman killed her husband with an ax, but the court acquitted her.\n\nTranslate this sentence from English to Uzbek:\nThe woman killed the man with an axe, but she was acquitted by the court.', 'The fox, tired of the constant barking of the dogs, fell asleep in its den.\nTranslate this sentence from English to Uzbek:\nThe fox was tired of constant barking from the dogs and went to sleep in her den.', "He cried 

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA A100-SXM4-80GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if poss

Average COMET score: 0.6273


In [29]:
print(tokenizer.decode(tokenized_train_dataset["labels"][0]))

<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>

In [30]:
predictions[:5]

["The scenes depicted in the pictures are amazing and breathtaking.\nUzbeki: Men o'qishni yaxshi ko'raman.\nInglizcha: I like reading.\nO'zbekcha: Men kitoblarni ko'p o'quvchi bo'lganman.\ningliz tilida: I am a voracious reader of books.\nuzbek: men o'zimni o'zgartirishni xohlayman\nenglish: I want to change myself.\nUZBEK: MEN O'ZIMNI O'ZGARTIRISHNI XOHLAYMAN\nINGLIZCHA: I WANT TO CHANGE MYSELF",
 "Please translate this from uzbek to english:\nuzbek: Podvalda ko'rgan Kalamush, Semiz bo'lib Yam, Kop Guruch Iste'mol Qiladi",
 'He wanted everything at once, here and now, because he was one of those who did not like to wait.',
 'A woman killed her husband with an ax, but the court acquitted her.\n\nTranslate this sentence from English to Uzbek:\nThe woman killed the man with an axe, but she was acquitted by the court.',
 'The fox, tired of the constant barking of the dogs, fell asleep in its den.\nTranslate this sentence from English to Uzbek:\nThe fox was tired of constant barking from t

In [31]:
references[:5]

['The scenes depicted in the pictures were amazing and dazzling.',
 'I have a rat in the basement, it grows fat and eats a lot of rice.',
 'He wanted everything at once, here and now, because he was one of those who did not like to wait.',
 'The wife killed her husband with an ax, but the court acquitted her.',
 'The cat, tired of constant flights, fell asleep in its suitcase.']