In [1]:
# %pip install -r ./requirements.txt

In [1]:
# Install the third-party packages used below into the kernel's environment (versions are not pinned).
%pip install bitsandbytes accelerate peft unbabel-comet wandb

Defaulting to user installation because normal site-packages is not writeable
Collecting protobuf<5.0.0,>=4.24.4 (from unbabel-comet)
  Downloading protobuf-4.25.5-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Downloading protobuf-4.25.5-cp37-abi3-manylinux2014_x86_64.whl (294 kB)
Installing collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.20.0
    Uninstalling protobuf-3.20.0:
      Successfully uninstalled protobuf-3.20.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pandas-gbq 0.17.9 requires pyarrow<10.0dev,>=3.0.0, but you have pyarrow 17.0.0 which is incompatible.[0m[31m
[0mSuccessfully installed protobuf-4.25.5

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m

In [2]:
# %pip uninstall protobuf -y
# Pin protobuf back to 3.20.
# NOTE(review): pip's output for this cell reports that unbabel-comet 2.2.2 requires
# protobuf>=4.24.4,<5.0.0, so this pin conflicts with it — confirm COMET still works.
%pip install protobuf==3.20

Found existing installation: protobuf 4.25.5
Uninstalling protobuf-4.25.5:
  Successfully uninstalled protobuf-4.25.5
Defaulting to user installation because normal site-packages is not writeable
Collecting protobuf==3.20
  Downloading protobuf-3.20.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (698 bytes)
Downloading protobuf-3.20.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: protobuf
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
unbabel-comet 2.2.2 requires protobuf<5.0.0,>=4.24.4, but you have protobuf 3.20.0 which is incompatible.
google-api-core 2.11.1 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0.dev0

In [None]:
import os
import random
import torch
import wandb
import numpy as np
from typing import Dict, List
from tqdm import tqdm

from datasets import load_dataset

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

from peft import (
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model
)

from comet import (
    download_model,
    load_from_checkpoint
)

In [None]:
# Training hyperparameters (also recorded in the W&B run config).
lr = 2.5e-5
num_epochs = 1

# Read the W&B API key from the environment instead of committing it to the notebook.
# When WANDB_API_KEY is unset this is None, and wandb.login(key=None) falls back to
# wandb's own credential lookup.
WANDB_KEY = os.environ.get("WANDB_API_KEY")

In [None]:
# Authenticate this session with Weights & Biases using the configured key.
wandb.login(key=WANDB_KEY)

AllocationException: VM could not be allocated

In [None]:
# Open a W&B run in the "multilang_mt" project; the hyperparameters are stored
# as the run's config so they show up next to its metrics.
run = wandb.init(
    project="multilang_mt",
    config=dict(learning_rate=lr, epochs=num_epochs),
)

In [None]:
def seed_everything(seed):
    """Seed every random number generator the notebook relies on.

    Covers Python's ``random``, NumPy's global RNG and PyTorch (CPU and all
    CUDA devices), and puts cuDNN into deterministic, non-autotuning mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Exported for any subprocesses started later; the running interpreter's
    # hash seed was already fixed at start-up.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all seeds every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning can pick different kernels between runs, which defeats determinism.
    torch.backends.cudnn.benchmark = False


seed_everything(123456)

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

In [None]:
# Root folder holding the data files. Override it with the FLORES_DATA_DIR
# environment variable to run outside the original project layout; the default
# reproduces the original absolute paths.
data_dir = os.environ.get("FLORES_DATA_DIR", "/home/jupyter/datasphere/project/data")

# JSON-lines files: the "dev" file is used for training, "devtest" for validation.
dataset = load_dataset(
    "json",
    data_files={
        "train": os.path.join(data_dir, "flores200_dev", "en_uz_dev.jsonl"),
        "val": os.path.join(data_dir, "flores200_devtest", "en_uz_devtest.jsonl"),
    },
)

In [None]:
model_path = "haoranxu/X-ALMA-13B-Pretrain"
# Sequences are capped at 256 tokens and padded on the left.
tokenizer = AutoTokenizer.from_pretrained(
    model_path, model_max_length=256, padding_side="left"
)
# tokenize() pads every example to a fixed length, which raises if the tokenizer
# has no pad token. Keep whatever the checkpoint defines; only fall back to EOS
# when nothing is set.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Quantization settings used when loading the base model:
# 4-bit NF4 weights, bfloat16 compute dtype, double quantization switched off.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

In [None]:
# Load the base checkpoint with the quantization config, then pass it through
# PEFT's k-bit training preparation.
model = prepare_model_for_kbit_training(
    AutoModelForCausalLM.from_pretrained(
        model_path,
        quantization_config=bnb_config,
    )
)

In [None]:
# Display the loaded model's repr.
model

In [None]:
def tokenize(samples):
    """Tokenize a batch of Uzbek/English pairs into fixed-length model inputs.

    Args:
        samples: Batch mapping with parallel "uz" and "en" lists of sentences.

    Returns:
        The tokenizer output for the Uzbek prompts, with an added "labels"
        entry holding the token ids of the English references. Prompts and
        references are each truncated/padded to 256 tokens.

    NOTE(review): prompts and references are encoded separately, so "labels"
    is not position-aligned with "input_ids" — confirm this is what the
    collator/trainer expects for a causal LM.
    """
    encode_kwargs = dict(
        truncation=True,
        max_length=256,
        padding="max_length",
        return_tensors="pt",
    )

    prompts = []
    for uz in samples["uz"]:
        prompts.append(f"Translate this from Uzbek to English:\nUzbek: {uz}\nEnglish:")

    model_inputs = tokenizer(prompts, **encode_kwargs)
    reference_encoding = tokenizer(samples["en"], **encode_kwargs)
    model_inputs["labels"] = reference_encoding["input_ids"]
    return model_inputs

In [None]:
# Run tokenize() batch-wise over both splits (train first, then val).
tokenized_train_dataset, tokenized_val_dataset = (
    dataset[split_name].map(tokenize, batched=True)
    for split_name in ("train", "val")
)

In [None]:
# Sanity check: decode the first training example's input ids back to text.
first_train_example = tokenized_train_dataset[0]
tokenizer.decode(first_train_example["input_ids"])

In [None]:
# One Uzbek sentence wrapped in the same prompt template that tokenize() builds,
# used for a qualitative generation check.
eval_prompt = "Translate this from Uzbek to English:\nUzbek: Dushanba kuni Stenford Universitetining Tibbiyot maktabi olimlari hujayralarni turlariga qarab saralay oladigan yangi tashxis vositasi ixtirosini e'lon qildi: har biri taxminan bir AQSH senti atrofida bo'lgan standart rangli printerlardan foydalangan holda ishlab chiqarish mumkin bo'lgan ingichka bosma chip.\nEnglish:"

In [None]:
# Qualitative check: let the model translate eval_prompt and print the decoded text.
model.eval()
model_input = tokenizer(eval_prompt, return_tensors="pt").to(device)
with torch.no_grad():
    # NOTE(review): pad_token_id=2 is a hard-coded id — presumably the tokenizer's EOS; confirm.
    generated = model.generate(**model_input, max_new_tokens=256, pad_token_id=2)
decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
print(decoded)

# Raw label ids stored for the first training example.
print(tokenized_train_dataset["labels"][0])

In [None]:
# LoRA adapter configuration with a reduced lora_alpha.
lora_config = LoraConfig(
    r=8,
    lora_alpha=5,  # Reduced lora_alpha for smaller scaling factor
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj", "lm_head"],
    lora_dropout=0.05,
    # The base model is loaded with AutoModelForCausalLM (decoder-only), so the
    # adapter must use PEFT's causal-LM task type rather than SEQ2SEQ_LM.
    task_type="CAUSAL_LM",
)
# NOTE: this rebinds `model` to the PEFT wrapper, so re-running the cell wraps it again.
model = get_peft_model(model, lora_config)

In [None]:
# Put the PEFT-wrapped model into training mode.
model.train()

In [None]:
# Data collator for causal-LM batches (mlm=False), padding to a multiple of 256 tokens.
# NOTE(review): with mlm=False this collator is documented to rebuild "labels" from
# "input_ids"; if so, the English references stored by tokenize() never reach the
# loss — confirm before trusting the training signal.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, pad_to_multiple_of=256)

In [None]:
args = Seq2SeqTrainingArguments(
    output_dir="./checkpoints/xalma-finetune-3.0-turkic",
    # --- optimisation ---
    num_train_epochs=num_epochs,
    # max_steps=300,
    learning_rate=lr,
    warmup_steps=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    # NOTE(review): the quantization config computes in bfloat16 while mixed
    # precision here is fp16 — confirm this combination is intended.
    fp16=True,
    # --- logging, checkpointing and evaluation cadence ---
    logging_steps=50,  # log every 50 steps
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=100,
    eval_strategy="steps",
    eval_steps=100,
    do_eval=True,
    # NOTE(review): a W&B run is initialised earlier in the notebook, but with
    # report_to="none" the Trainer does not report to it — confirm this is wanted.
    report_to="none",
)

In [None]:
# Wire model, arguments, tokenized splits and collator into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
)

In [None]:
# Run fine-tuning, keeping the trainer's result object for later inspection.
train_result = trainer.train()
# Rebind `model` to the trainer's model so the following cells use it.
model = trainer.model

In [None]:
# Switch the model to evaluation mode before scoring.
model.eval()

In [None]:
# Download the "Unbabel/wmt22-comet-da" checkpoint, load it and move it to `device`.
model_path_comet = download_model("Unbabel/wmt22-comet-da")
model_comet = load_from_checkpoint(model_path_comet).to(device)


def comet(data: List[Dict[str, str]]) -> List[float]:
    """Score machine translations with the loaded COMET model.

    Args:
        data: One dict per segment, in the form::

            {
                "src": "В понедельник",  # source text that was translated
                "mt": "On Monday",       # machine translation to score
                "ref": "On Monday",      # reference translation
            }

    Returns:
        The ``scores`` list of the COMET prediction for ``data``. An empty
        ``data`` yields an empty list without invoking the model.
    """
    if not data:
        return []

    # COMET's ``gpus`` argument is a device count; only request a GPU when CUDA
    # is actually available so the function also works on CPU-only hosts.
    gpu_count = 1 if torch.cuda.is_available() else 0
    comet_metric = model_comet.predict(data, batch_size=8, gpus=gpu_count)
    return comet_metric.scores

In [None]:
# Smoke-test the COMET model on two hand-written German/English examples.
data = [
    dict(
        src="Dem Feuer konnte Einhalt geboten werden",
        mt="The fire could be stopped",
        ref="They were able to control the fire.",
    ),
    dict(
        src="Schulen und Kindergärten wurden eröffnet.",
        mt="Schools and kindergartens were open",
        ref="Schools and kindergartens opened",
    ),
]
model_output = model_comet.predict(data, batch_size=8, gpus=1)
print(model_output)

In [None]:
# Parallel lists filled by the loop below: Uzbek sources, English references
# and the model's translations.
all_predictions = []
all_references = []
all_sources = []

# batch_size = 1      # in Kaggle
batch_size = 4  # in DataSphere

# Evaluate on a random subset of at most 300 validation examples; the min()
# keeps select() from failing on a split with fewer rows.
shuffled_val_dataset = tokenized_val_dataset.shuffle()
val_dataset = shuffled_val_dataset.select(range(min(300, len(shuffled_val_dataset))))

model.eval()

for start in tqdm(range(0, len(val_dataset), batch_size)):
    # Slicing a Dataset returns a dict of column lists and clamps at the end.
    batch = val_dataset[start:start + batch_size]
    input_ids = torch.tensor(batch["input_ids"]).to(device)
    # Prompts were padded to a fixed length, so the attention mask must be passed
    # explicitly; otherwise generation attends to the padding tokens.
    attention_mask = torch.tensor(batch["attention_mask"]).to(device)

    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            num_beams=5,
            max_new_tokens=200,
            no_repeat_ngram_size=3,
            early_stopping=True,
        )
    outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    for uz, en, translated in zip(batch["uz"], batch["en"], outputs):
        all_sources.append(uz)
        all_references.append(en)
        # The decoded text still contains the prompt; keep what follows the last "English:".
        all_predictions.append(translated.split('English:')[-1].strip())

In [None]:
def _strip_leading_boilerplate(text):
    """Remove echoed instruction prefixes and leading spaces/newlines until none remain."""
    prefixes_to_remove = (
        "Please translate this from English to Uzbek:",
        "Translate this from English to Uzbek:",
        "Translate this sentence to Uzbek:",
        "Please translate this from uzbek to english:",
        "Translate this sentence from English to Uzbek:",
        "Translation:",
        " ",
        "\n",
    )
    while True:
        stripped = text
        for prefix in prefixes_to_remove:
            stripped = stripped.removeprefix(prefix)
        if stripped == text:
            return text
        text = stripped


def _clean_prediction(pred):
    """Reduce one raw generation to a single cleaned translation line."""
    # Cut off a trailing system-prompt style continuation, if present.
    pred = pred.split("\nYou are an AI assistant. User will you give you tasks.")[0]
    pred = _strip_leading_boilerplate(pred)

    # Drop lines that are labelled with a language name, e.g. "French: ...".
    language_tags = tuple(
        lang + ":"
        for lang in ("Spanish", "French", "Uzbek", "German", "Russian",
                     "Chinese", "Japanese", "Italian", "Portuguese", "Arabic")
    )
    kept_lines = [
        line for line in pred.split("\n")
        if not line.strip().startswith(language_tags)
    ]

    # Only the first remaining line is treated as the translation.
    first_line = kept_lines[0] if kept_lines else ""
    return first_line.strip()


def evaluate(model, tokenizer=None):
    """Clean the collected predictions and return their average COMET score.

    Reads the module-level ``all_sources``, ``all_predictions`` and
    ``all_references`` lists filled by the generation loop.

    Args:
        model: The translation model; only switched to eval mode here.
        tokenizer: Unused. Accepted so that the existing
            ``evaluate(model, tokenizer)`` call keeps working.

    Returns:
        ``(avg_comet_score, all_predictions, all_references)``. Note that the
        returned predictions are the raw ones, not the cleaned strings that
        were scored.

    Raises:
        ValueError: If no predictions have been collected.
    """
    model.eval()

    if not all_predictions:
        raise ValueError("No predictions to evaluate; run the generation loop first.")

    current_predictions = [_clean_prediction(pred) for pred in all_predictions]

    comet_data = [
        {"src": src, "mt": pred, "ref": ref}
        for src, pred, ref in zip(all_sources, current_predictions, all_references)
    ]

    comet_scores = comet(comet_data)
    avg_comet_score = sum(comet_scores) / len(comet_scores)

    return avg_comet_score, all_predictions, all_references


# evaluate() only needs the model; the predictions it scores are read from the
# lists filled by the generation loop.
avg_comet_score, predictions, references = evaluate(model)
print(f"Average COMET score: {avg_comet_score:.4f}")

In [None]:
# Decode the label ids of the first training example back to text.
first_label_ids = tokenized_train_dataset[0]["labels"]
print(tokenizer.decode(first_label_ids))