In [None]:
pip install sacrebleu

In [None]:
pip install unsloth

In [None]:
pip install -U wandb -q

In [None]:
from datasets import Dataset, concatenate_datasets
from sacrebleu import corpus_bleu
from transformers import pipeline
from unsloth import FastLanguageModel
import torch

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:

def load_parallel_data(romanian_path, romani_path):
    """Build a `Dataset` of sentence pairs from two line-aligned UTF-8 text files.

    Line *i* of `romanian_path` is paired with line *i* of `romani_path`.
    Surrounding whitespace (including the newline) is stripped from every line.

    Args:
        romanian_path: Path to the Romanian side of the corpus.
        romani_path: Path to the Romani side of the corpus.

    Returns:
        The result of `Dataset.from_dict` on a single `"translation"` column
        whose entries are `{"ro": ..., "roma": ...}` dicts.

    Raises:
        AssertionError: If the two files do not contain the same number of lines.
    """
    with open(romanian_path, 'r', encoding='utf-8') as ro_file:
        with open(romani_path, 'r', encoding='utf-8') as roma_file:
            source_lines = list(ro_file)
            target_lines = list(roma_file)

    assert len(source_lines) == len(target_lines), "Mismatched number of lines!"

    pairs = []
    for source_line, target_line in zip(source_lines, target_lines):
        pairs.append({"ro": source_line.strip(), "roma": target_line.strip()})

    return Dataset.from_dict({"translation": pairs})

In [None]:
# Load the line-aligned Romanian/Romani corpus from the Kaggle input directory.
huggingface_dataset = load_parallel_data('/kaggle/input/romani-romanian/romanian.txt', '/kaggle/input/romani-romanian/romani.txt')
huggingface_dataset_shuffled = huggingface_dataset.shuffle(seed = 42)

# Hold out 20 % of the pairs. `train_test_split` shuffles again by default, so it is
# seeded here as well; otherwise the partition changes on every run even though the
# explicit shuffle above is seeded.
train_test_split = huggingface_dataset_shuffled.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]

# Halve the held-out part. The result is a DatasetDict whose two halves are stored
# under the keys "train" and "test".
test_eval_split = train_test_split["test"].train_test_split(test_size = 0.5, seed=42)

In [None]:
# Quick sanity check of the raw corpus, the shuffled corpus and the resulting splits.
print(huggingface_dataset[0])
print(huggingface_dataset_shuffled[0])
print(len(huggingface_dataset))
print("="*50)
print("Length of train set: ",len(train_dataset))
# `test_eval_split` is a DatasetDict, so len() on it is the number of splits (2),
# not a row count. Report the size of each half instead.
print("Length of eval set (test_eval_split['train']): ", len(test_eval_split["train"]))
print("Length of test set (test_eval_split['test']): ", len(test_eval_split["test"]))
print(train_dataset)
print(test_eval_split)
print(test_eval_split["train"][0])
print(test_eval_split["test"][0])

{'translation': {'ro': 'Ne-ai onorat să purtăm al tău nume:', 'roma': 'Hin bari visada prekal amende,'}}
{'translation': {'ro': 'Şi iată că o femeie păcătoasă din cetate a aflat că El era la masă în casa Fariseului: a adus un vas de alabastru cu mir mirositor,', 'roma': 'Sas iek zhuvli bezexali kai sas ando gav, kana ashundia ke O Jesus sas pa e skafidi ando kher le Farizeanongo, andini iek botela pherdo duxi.'}}
10725
Length of train set:  8580
Length of test set:  2
Dataset({
    features: ['translation'],
    num_rows: 8580
})
DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1072
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 1073
    })
})
{'translation': {'ro': 'Dar înainte de toate acestea, vor pune mînile pe voi, şi vă vor prigoni: vă vor da pe mîna sinagogelor, vă vor arunca în temniţe, vă vor tîrî înaintea împăraţilor şi înaintea dregătorilor, din pricina Numelui Meu.', 'roma': 'Numa mai anglal sar kodola 

In [None]:

# Loader settings. `max_seq_length` is reused further down when the trainer is built.
max_seq_length = 4096
# NOTE(review): dtype=None presumably lets Unsloth choose the compute dtype — confirm.
dtype = None
# The checkpoint requested below is a pre-quantised "bnb-4bit" variant.
load_in_4bit = True

# Load the base Llama 3.2 3B Instruct model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Switch the model to Unsloth's inference mode; as the last expression of the cell
# its return value is what the notebook displays.
# NOTE(review): the only pre-fine-tuning inference code in this notebook is commented
# out — confirm whether this call is still wanted before the model is fine-tuned.
FastLanguageModel.for_inference(model)

==((====))==  Unsloth 2025.1.8: Fast Llama patching. Transformers: 4.48.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.35G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072, padding_idx=128004)
    (layers): ModuleList(
      (0): LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,)

In [None]:
# # Function to translate Romanian sentences

# def translate(text):
#     prompt = f"Tradu urmatorul text din limba Română în limba Romani (vorbită de Romii din România):\nRomână: {text}\nRomani:"
#     inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True).to("cuda")
#     outputs = model.generate(**inputs, max_new_tokens=100, use_cache=True)

#     decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
#     return decoded.split("\nRomani:")[1]

In [None]:
# from tqdm import tqdm  # Import tqdm for progress tracking

# test_dataset = test_eval_split["test"]  # Get the test dataset

# romanian_sentences = [example["translation"]["ro"] for example in test_dataset]
# romani_ground_truth = [[example["translation"]["roma"]] for example in test_dataset]  # List of lists

# # Generate translations with progress tracking
# translated_sentences = []
# for sentence in tqdm(romanian_sentences[:20], desc="Translating", unit="sentence"):
#     translated_sentences.append(translate(sentence))

# # Compute BLEU score with progress tracking
# bleu_score = corpus_bleu(translated_sentences, romani_ground_truth).score

# print("=" * 50)
# print(f"BLEU Score: {bleu_score}")
# print("=" * 50)

In [None]:
# print(translated_sentences)

In [None]:
# import gc
# for _ in range(5):
#     gc.collect()
#     torch.cuda.empty_cache()

In [None]:
# Modules that receive LoRA adapters, grouped by role. The concatenation below keeps
# the same order as a single flat list would.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]
embedding_layers = ["embed_tokens", "lm_head"]

# All adapter hyper-parameters in one place so they are easy to scan and tweak.
lora_settings = dict(
    r = 32,
    target_modules = attention_projections + mlp_projections + embedding_layers,
    lora_alpha = 32,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = True,
    loftq_config = None,
)

# Wrap the loaded base model with the PEFT/LoRA configuration above.
model = FastLanguageModel.get_peft_model(model, **lora_settings)

Unsloth: Offloading input_embeddings to disk to save VRAM


  offloaded_W = torch.load(filename, map_location = "cpu", mmap = True)


Unsloth: Offloading output_embeddings to disk to save VRAM


Unsloth 2025.1.8 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


Unsloth: Training embed_tokens in mixed precision to save VRAM
Unsloth: Training lm_head in mixed precision to save VRAM


In [None]:
# Training prompt template. The Romanian instruction reads roughly: "Translate the
# following text from Romanian into Romani (spoken by the Roma of Romania)".
# The two `{}` slots are filled positionally: first the Romanian sentence, then the
# Romani translation.
translate_prompt = """Tradu urmatorul text din limba Română în limba Romani (vorbită de Romii din România):
Română: {}
Romani: {}"""

# End-of-sequence marker taken from the tokenizer; formatting_prompts_func appends it
# to every formatted training example.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    """Render a batch of translation pairs into training strings.

    Args:
        examples: A batch mapping whose `"translation"` entry is a sequence of
            `{"ro": ..., "roma": ...}` dicts.

    Returns:
        `{"text": [...]}` with one string per pair: `translate_prompt` filled with
        the Romanian and Romani sentences, followed by `EOS_TOKEN`.
    """
    rendered = []
    for pair in examples["translation"]:
        filled_prompt = translate_prompt.format(pair["ro"], pair["roma"])
        rendered.append(filled_prompt + EOS_TOKEN)

    return {"text": rendered}

In [None]:
# Add a "text" column with the fully formatted prompt for every training pair.
# batched=True hands formatting_prompts_func a batch of rows per call.
train_mapped = train_dataset.map(formatting_prompts_func, batched = True)

Map:   0%|          | 0/8580 [00:00<?, ? examples/s]

In [None]:
import os

import wandb

# SECURITY: never commit an API key to a notebook. The key is read from the
# WANDB_API_KEY environment variable (for example populated from a secret store).
# When the variable is unset, `key` is None and wandb falls back to its own
# credential lookup.
# The key that was previously hardcoded in this cell must be considered leaked and
# should be revoked in the Weights & Biases account settings.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhandacradu[0m ([33mechipa-radu[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments

# Supervised fine-tuning on the formatted "text" column of the training split.
trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_mapped,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,

    args = UnslothTrainingArguments(
        # 2 sequences per step x 8 accumulation steps = 16 sequences per optimizer update.
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 8,
        # A stray "," used to sit on its own line here, which is a SyntaxError inside
        # the argument list; it has been removed.
        warmup_ratio = 0.2,
        num_train_epochs = 4,

        # Separate, smaller learning rate for the embedding matrices.
        learning_rate = 5e-5,
        embedding_learning_rate = 1e-5,

        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 100,
        # No intermediate checkpoints; the model is saved explicitly after training.
        save_strategy="no",
        optim = "paged_adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "llama3.2_3b",
        report_to = "wandb",
    ),
)

Map (num_proc=2):   0%|          | 0/8580 [00:00<?, ? examples/s]

In [None]:

# Record the GPU's capacity and the memory already reserved before training starts,
# so the post-training report can show how much the training run added.
BYTES_PER_GIB = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GIB, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
5.342 GB of memory reserved.


In [None]:
# Run the fine-tuning loop. The returned object is kept because the reporting cell
# further down reads trainer_stats.metrics['train_runtime'].
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 8,580 | Num Epochs = 4
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 2,144
 "-____-"     Number of trainable parameters = 836,632,576
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss
100,3.1537
200,2.0585
300,1.7245
400,1.533
500,1.4342
600,1.27
700,1.1853
800,1.1535
900,1.1098
1000,1.1025


In [None]:
# Persist the fine-tuned model and its tokenizer to the local 'llama3.2_3b' directory
# (the same path used as the trainer's output_dir).
# NOTE(review): for a PEFT-wrapped model this presumably writes only the adapter
# weights, not a merged model — confirm before relying on it for deployment.
model.save_pretrained('llama3.2_3b')
tokenizer.save_pretrained('llama3.2_3b')

('llama3.2_3b/tokenizer_config.json',
 'llama3.2_3b/special_tokens_map.json',
 'llama3.2_3b/tokenizer.json')

In [None]:
import gc
# Run Python garbage collection and release PyTorch's cached CUDA memory, repeated
# five times, before switching from training to inference.
for _ in range(5):
    gc.collect()
    torch.cuda.empty_cache()

In [None]:

# Post-training report: wall-clock training time plus peak reserved GPU memory, both
# in absolute terms and relative to the pre-training baseline (`start_gpu_memory`)
# and to the card's capacity (`max_memory`).
peak_reserved_bytes = torch.cuda.max_memory_reserved()
used_memory = round(peak_reserved_bytes / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

13721.2984 seconds used for training.
228.69 minutes used for training.
Peak reserved memory = 11.16 GB.
Peak reserved memory for training = 5.818 GB.
Peak reserved memory % of max memory = 75.707 %.
Peak reserved memory for training % of max memory = 39.468 %.


In [None]:
selected_sample = "Daţi mai bine milostenie din lucrurile dinlăuntru, şi atunci toate vă vor fi curate."

# The inference prompt must be an exact prefix of the training prompt. It therefore
# ends at "Romani:" with NO trailing space: during training the space after the colon
# is immediately followed by the translation, so a prompt ending in a bare space is a
# context the model never saw (the previously recorded output started with a spurious
# "1" before the translation).
inference_prompt = """Tradu urmatorul text din limba Română în limba Romani (vorbită de Romii din România):
Română: {}
Romani:"""

FastLanguageModel.for_inference(model)
inputs = tokenizer([inference_prompt.format(selected_sample)], return_tensors = "pt").to("cuda")

# Keep prompt + generated tokens within the sequence length the model was loaded with;
# a fixed budget of 4096 new tokens on top of the prompt would exceed it.
generation_budget = max_seq_length - inputs["input_ids"].shape[1]

outputs = model.generate(**inputs, max_new_tokens = generation_budget, use_cache = True, do_sample=False)
# Decoded with special tokens left in, so the end-of-turn marker is visible.
tokenizer.batch_decode(outputs)

['<|begin_of_text|>Tradu urmatorul text din limba Română în limba Romani (vorbită de Romii din România):\nRomână: Daţi mai bine milostenie din lucrurile dinlăuntru, şi atunci toate vă vor fi curate.\nRomani: 1 De mai but love kal chache kai si tume andre, ai antunchi sa vuzhara avena.<|eot_id|>']

In [None]:

# Inference prompt: identical to the training prompt up to and including "Romani:".
# It deliberately has NO trailing space — in the training examples the space after the
# colon is always followed directly by the translation, so a prompt ending in a bare
# space is a context the model never saw during fine-tuning.
prompt_template = """Tradu urmatorul text din limba Română în limba Romani (vorbită de Romii din România):
Română: {}
Romani:"""
def translate(texts):
    """Translate a batch of Romanian sentences into Romani with the fine-tuned model.

    Args:
        texts: A list of Romanian sentences. A single string is accepted as well and
            is treated as a one-element batch.

    Returns:
        A list with one translation per input sentence, in input order, with
        surrounding whitespace removed. An empty input yields an empty list.
    """
    if isinstance(texts, str):
        # Iterating a bare string would translate it character by character.
        texts = [texts]
    if not texts:
        # Nothing to tokenize; avoid calling the tokenizer/model with an empty batch.
        return []

    prompts = [prompt_template.format(text) for text in texts]

    # Batched generation with a decoder-only model needs left padding so that every
    # prompt ends directly where generation starts.
    # NOTE(review): Unsloth's for_inference may already set this — being explicit is harmless.
    tokenizer.padding_side = "left"
    inputs = tokenizer(prompts, return_tensors="pt", truncation=True, padding=True).to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=100, use_cache=True, do_sample=False)

    # Decode only the newly generated tokens instead of splitting the decoded text on
    # "\nRomani:", which breaks if that marker occurs in the source sentence or output.
    prompt_length = inputs["input_ids"].shape[1]
    decoded = tokenizer.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True)

    return [text.strip() for text in decoded]

In [None]:
from tqdm import tqdm

# Held-out half reserved for the final evaluation.
test_dataset = test_eval_split["test"]

romanian_sentences = [example["translation"]["ro"] for example in test_dataset]
# One single-element list per sentence (shape kept as before for any later cells).
romani_ground_truth = [[example["translation"]["roma"]] for example in test_dataset]

batch_size = 16
translated_sentences = []

# Translate the test set in fixed-size batches; the last batch may be shorter.
for i in tqdm(range(0, len(romanian_sentences), batch_size), desc="Translating", unit="batch"):
    batch = romanian_sentences[i : i + batch_size]
    translated_sentences.extend(translate(batch))

# sacrebleu's corpus_bleu expects `references` as a list of reference *streams*: each
# inner list holds one reference per hypothesis, aligned by position. Passing one
# single-element list per sentence (the previous call) is read as ~N streams of
# length 1, so the score reported earlier was not a valid corpus-level BLEU.
reference_stream = [references[0] for references in romani_ground_truth]
assert len(translated_sentences) == len(reference_stream), "Hypothesis/reference count mismatch!"

bleu_score = corpus_bleu(translated_sentences, [reference_stream]).score

print("=" * 50)
print(f"BLEU Score: {bleu_score}")
print("=" * 50)

Translating: 100%|██████████| 68/68 [12:02<00:00, 10.62s/batch]

BLEU Score: 21.3643503198117





In [None]:
# Show a short preview only; dumping every translation bloats the notebook output.
translated_sentences[:10]

[' 1) Den mai but love kal chorhe kai si tume andre, ai antunchi sa vuzhilela.',
 ' 9 O Ježiš phenďa, hoj peske kampel te modľinas, kaj o kraľišagos le Devleskero te avel, a oda, so o Del kamel, te kerel pes upre savore phuv.',
 ' 3 Ando tampla, wo arakhlia kodolen kai bichinenas zhege, ai zhege le bakre ai le gurumlia, ai kodolen kai parhuvenas le love.',
 ' 1 Zhanas, murhe phral, ke O Del drago les tumenge te alosardia tume.',
 ' 3 Ai sar xanas lensa, lia o manrho, ai kana naisilas le Devles, phaglia les, ai dia les te xan.',
 '  Nasul tumenge, Gramnoturia ai Farizeanuria! Tume san sar le greposhevki kai si shukar avrial, numa andral si pherde le mule, ai sa le nasulimata.',
 ' 8 O Jesus phendia lenge, "Tume roden te sikaven ke vorta san angla manush, numa O Del zhanel tumare ile; ke so si baro mashkar le manush si gratsia le Devleske.',
 ' 9 Numa chi dashtinas te den duma pa leste angla narodo, ke chudisaile kana ashunde sar del duma, ai chi mai phende khanch.',
 ' 1. a) So pes ačhi