In [1]:
# --------------------------------------------
# Cell 1: Initial imports and environment setup
# --------------------------------------------
import torch
import numpy as np
import os
import json
from tqdm import tqdm
from datasets import load_dataset
from sacrebleu import corpus_bleu
from transformers import AutoModelForCausalLM, AutoTokenizer
import evaluate
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments
)

print(f"NumPy version: {np.__version__}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU name: {torch.cuda.get_device_name(0)}")

os.makedirs("./saved_models", exist_ok=True)
os.makedirs("./results", exist_ok=True)
os.makedirs("./my_results", exist_ok=True)

os.environ["HUGGING_FACE_HUB_TOKEN"] = "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"

NumPy version: 1.25.2
PyTorch version: 2.5.1+cu118
CUDA available: True
GPU name: Tesla V100-SXM2-32GB


In [2]:
# Cell 2: Load Base Model & Tokenizer (Baseline)
# --------------------------------------------
model_name = "meta-llama/Llama-2-7b-hf"
base_model_save_path = "./saved_models/base_llama"

def load_or_download_base_model():
    if os.path.exists(base_model_save_path):
        print("Loading saved base model...")
        tokenizer = AutoTokenizer.from_pretrained(base_model_save_path)
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_save_path,
            device_map="auto",
            torch_dtype=torch.float16
        )
    else:
        print("Downloading base model...")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        base_model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            torch_dtype=torch.float16
        )
        # Save the model and tokenizer
        print("Saving base model...")
        tokenizer.save_pretrained(base_model_save_path)
        base_model.save_pretrained(base_model_save_path)
        print(f"Base model saved to {base_model_save_path}")

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    return tokenizer, base_model

# Load/download the model
tokenizer, base_model = load_or_download_base_model()
print("Base model and tokenizer ready.")

Downloading base model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Saving base model...
Base model saved to ./saved_models/base_llama
Base model and tokenizer ready.


In [3]:
# Cell 3: Load Evaluation Data & Get Baseline Scores
# --------------------------------------------
def build_prompt_for_translation(german_text: str) -> str:
    prompt = (
        "You are an expert German-English translator with deep knowledge of both languages.\n\n"
        "Instructions:\n"
        "- Translate the German text into natural, fluent English\n"
        "- Maintain the original meaning and tone\n"
        "- Use appropriate idioms and expressions\n"
        "- Ensure cultural nuances are properly conveyed\n\n"
        f"German text:\n{german_text}\n\n"
        "English translation:"
    )
    return prompt

def debug_evaluate_model(
    model,
    tokenizer,
    eval_dataset,
    num_examples=20,
    debug_print=3,
    description="Model"
):
    comet_metric = evaluate.load("comet")
    predictions = []
    references = []
    sources = []

    subset = eval_dataset.select(range(min(num_examples, len(eval_dataset))))
    print(f"\n[DEBUG EVAL] {description} on {num_examples} examples...\n")

    for i, ex in enumerate(tqdm(subset, desc=f"Evaluating {description}")):
        src_de = ex["translation"]["de"]
        ref_en = ex["translation"]["en"]

        prompt_text = build_prompt_for_translation(src_de)

        tokenized_input = tokenizer(
            prompt_text,
            return_tensors="pt",
            add_special_tokens=True
        ).to(model.device)

        with torch.no_grad():
            output_ids = model.generate(
                **tokenized_input,
                max_new_tokens=256,
                num_beams=4,
                do_sample=False,
                early_stopping=True,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

        full_output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        # Fixed extraction logic
        if "English translation:" in full_output_text:
            pred_en = full_output_text.split("English translation:")[-1].strip()
        else:
            pred_en = full_output_text.split("German text:")[-1].strip()

        predictions.append(pred_en)
        references.append([ref_en])
        sources.append(src_de)

        if i < debug_print:
            print("\n==========================================")
            print(f"Example {i}")
            print("---------------[ PROMPT ]-----------------")
            print(prompt_text)
            print("--------------[ TOKENIZED ]---------------")
            print(f"Input IDs: {tokenized_input['input_ids'][0].tolist()}")
            print("-----------[ FULL MODEL OUTPUT ]----------")
            print(repr(full_output_text))
            print("-------------[ EXTRACTED EN ]-------------")
            print(repr(pred_en))
            print("--------------[ REFERENCE ]---------------")
            print(ref_en)
            print("==========================================\n")

    bleu = corpus_bleu(predictions, references)
    print(f"[{description}] BLEU = {bleu.score:.2f}")

    comet_results = comet_metric.compute(
        predictions=predictions,
        references=[r[0] for r in references],
        sources=sources
    )
    print(f"[{description}] COMET = {comet_results['mean_score']:.3f}\n")

    return {
        "predictions": predictions,
        "references": references,
        "bleu": bleu.score,
        "comet": comet_results["mean_score"]
    }

def load_eval_data(num_examples=50):
    print(f"Loading WMT19 (de-en) validation data with {num_examples} examples...")
    eval_data = load_dataset("wmt19", "de-en", split="validation")
    eval_data = eval_data.select(range(min(num_examples, len(eval_data))))
    return eval_data

# Get baseline scores immediately
eval_dataset = load_eval_data(num_examples=50)
baseline_results_path = "./results/baseline_results.json"

if os.path.exists(baseline_results_path):
    print("Loading saved baseline results...")
    with open(baseline_results_path, 'r') as f:
        baseline_debug_results = json.load(f)
    print(f"Baseline -> BLEU = {baseline_debug_results['bleu']:.2f}, COMET = {baseline_debug_results['comet']:.3f}")
else:
    print("Evaluating baseline model...")
    baseline_debug_results = debug_evaluate_model(
        model=base_model,
        tokenizer=tokenizer,
        eval_dataset=eval_dataset,
        num_examples=20,
        debug_print=3,
        description="Baseline LLaMA"
    )
    # Save results
    with open(baseline_results_path, 'w') as f:
        json.dump(baseline_debug_results, f)

print("Baseline evaluation complete.")

Loading WMT19 (de-en) validation data with 50 examples...
Evaluating baseline model...


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/f49d328952c3470eff6bb6f545d62bfdb6e66304/checkpoints/model.ckpt`
Encoder model frozen.
/opt/conda/lib/python3.11/site-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']



[DEBUG EVAL] Baseline LLaMA on 20 examples...



Evaluating Baseline LLaMA:   5%|â–ˆâ–ˆâ–Š                                                      | 1/20 [00:02<00:52,  2.74s/it]


Example 0
---------------[ PROMPT ]-----------------
You are an expert German-English translator with deep knowledge of both languages.

Instructions:
- Translate the German text into natural, fluent English
- Maintain the original meaning and tone
- Use appropriate idioms and expressions
- Ensure cultural nuances are properly conveyed

German text:
MÃ¼nchen 1856: Vier Karten, die Ihren Blick auf die Stadt verÃ¤ndern

English translation:
--------------[ TOKENIZED ]---------------
Input IDs: [1, 887, 526, 385, 17924, 5332, 29899, 24636, 5578, 1061, 411, 6483, 7134, 310, 1716, 10276, 29889, 13, 13, 3379, 582, 1953, 29901, 13, 29899, 4103, 9632, 278, 5332, 1426, 964, 5613, 29892, 1652, 8122, 4223, 13, 29899, 341, 2365, 475, 278, 2441, 6593, 322, 16225, 13, 29899, 4803, 8210, 1178, 29875, 4835, 322, 12241, 13, 29899, 22521, 545, 16375, 4948, 2925, 526, 6284, 27769, 287, 13, 13, 29954, 3504, 1426, 29901, 13, 29924, 3346, 2724, 29871, 29896, 29947, 29945, 29953, 29901, 23650, 476, 8109, 29

Evaluating Baseline LLaMA:  10%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–‹                                                   | 2/20 [00:03<00:30,  1.71s/it]


Example 1
---------------[ PROMPT ]-----------------
You are an expert German-English translator with deep knowledge of both languages.

Instructions:
- Translate the German text into natural, fluent English
- Maintain the original meaning and tone
- Use appropriate idioms and expressions
- Ensure cultural nuances are properly conveyed

German text:
Eine Irren-Anstalt, wo sich heute Jugendliche begegnen sollen.

English translation:
--------------[ TOKENIZED ]---------------
Input IDs: [1, 887, 526, 385, 17924, 5332, 29899, 24636, 5578, 1061, 411, 6483, 7134, 310, 1716, 10276, 29889, 13, 13, 3379, 582, 1953, 29901, 13, 29899, 4103, 9632, 278, 5332, 1426, 964, 5613, 29892, 1652, 8122, 4223, 13, 29899, 341, 2365, 475, 278, 2441, 6593, 322, 16225, 13, 29899, 4803, 8210, 1178, 29875, 4835, 322, 12241, 13, 29899, 22521, 545, 16375, 4948, 2925, 526, 6284, 27769, 287, 13, 13, 29954, 3504, 1426, 29901, 13, 29923, 457, 6600, 1267, 29899, 2744, 303, 1997, 29892, 8879, 2160, 12843, 19472, 4545, 

Evaluating Baseline LLaMA:  15%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–Œ                                                | 3/20 [00:05<00:26,  1.53s/it]


Example 2
---------------[ PROMPT ]-----------------
You are an expert German-English translator with deep knowledge of both languages.

Instructions:
- Translate the German text into natural, fluent English
- Maintain the original meaning and tone
- Use appropriate idioms and expressions
- Ensure cultural nuances are properly conveyed

German text:
Eine Gruftkapelle, wo nun fÃ¼r den S-Bahn-Tunnel gegraben wird.

English translation:
--------------[ TOKENIZED ]---------------
Input IDs: [1, 887, 526, 385, 17924, 5332, 29899, 24636, 5578, 1061, 411, 6483, 7134, 310, 1716, 10276, 29889, 13, 13, 3379, 582, 1953, 29901, 13, 29899, 4103, 9632, 278, 5332, 1426, 964, 5613, 29892, 1652, 8122, 4223, 13, 29899, 341, 2365, 475, 278, 2441, 6593, 322, 16225, 13, 29899, 4803, 8210, 1178, 29875, 4835, 322, 12241, 13, 29899, 22521, 545, 16375, 4948, 2925, 526, 6284, 27769, 287, 13, 13, 29954, 3504, 1426, 29901, 13, 29923, 457, 5430, 615, 21474, 1808, 29892, 8879, 11923, 1865, 972, 317, 29899, 29933, 

Evaluating Baseline LLaMA: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 20/20 [00:50<00:00,  2.52s/it]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[Baseline LLaMA] BLEU = 64.93


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[Baseline LLaMA] COMET = 0.786

Baseline evaluation complete.


In [5]:
 #--------------------------------------------
# Cell 4: Prepare Training Data with Improved Prompt
# --------------------------------------------
import datasets
from datasets import load_dataset
def build_full_text(example):
    german = example["translation"]["de"]
    english = example["translation"]["en"]
    prompt = (
        "You are an expert German-English translator with deep knowledge of both languages.\n\n"
        "Instructions:\n"
        "- Translate the German text into natural, fluent English\n"
        "- Maintain the original meaning and tone\n"
        "- Use appropriate idioms and expressions\n"
        "- Ensure cultural nuances are properly conveyed\n\n"
        f"German text:\n{german}\n\n"
        "English translation:"
    )
    full_text = prompt + " " + english
    return {"full_text": full_text}

def load_and_format_wmt(num_examples=5000):
    dataset_save_path = f"./saved_datasets/wmt19_train_{num_examples}"
    
    if os.path.exists(dataset_save_path):
        print(f"Loading saved WMT19 dataset from {dataset_save_path}...")
        dataset = datasets.load_from_disk(dataset_save_path)
    else:
        print(f"Downloading WMT19 (de-en) train data with {num_examples} examples...")
        dataset = load_dataset("wmt19", "de-en", split="train")
        dataset = dataset.shuffle(seed=42).select(range(num_examples))
        
        dataset = dataset.map(
            build_full_text,
            desc="Building prompt + target text",
            remove_columns=dataset.column_names
        )
        
        os.makedirs("./saved_datasets", exist_ok=True)
        dataset.save_to_disk(dataset_save_path)
        print(f"Dataset saved to {dataset_save_path}")
    
    return dataset

class PromptMaskCollator:
    def __init__(self, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __call__(self, examples):
        texts = [ex["full_text"] for ex in examples]
        
        tokenized = self.tokenizer(
            texts,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt"
        )
        input_ids = tokenized["input_ids"]
        attention_mask = tokenized["attention_mask"]
        labels = input_ids.clone()
        
        for i, text in enumerate(texts):
            if "English translation:" in text:
                prompt_part, _ = text.split("English translation:", 1)
                prompt_part = prompt_part + "English translation:"
            else:
                prompt_part = text
            
            prompt_ids = self.tokenizer(
                prompt_part,
                truncation=True,
                max_length=self.max_length,
                add_special_tokens=True
            )["input_ids"]
            
            prompt_len = len(prompt_ids)
            if prompt_len > labels.size(1):
                prompt_len = labels.size(1)
            
            labels[i, :prompt_len] = -100
        
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

# Load training data
training_data = load_and_format_wmt(num_examples=5000)
data_collator = PromptMaskCollator(tokenizer, max_length=512)

print("Training data prepared with prompt masking.")

Downloading WMT19 (de-en) train data with 5000 examples...


Building prompt + target text:   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Dataset saved to ./saved_datasets/wmt19_train_5000
Training data prepared with prompt masking.


In [10]:
# --------------------------------------------
# Cell 5: Updated LoRA Configuration
# --------------------------------------------
from peft import LoraConfig, get_peft_model, TaskType, PeftModel 

def setup_lora_model():
    print("Setting up LoRA model...")
    torch.cuda.empty_cache()
    
    for param in base_model.parameters():
        param.requires_grad = False 
        
    # Configure LoRA
    lora_config = LoraConfig(
        r=16,                    
        lora_alpha=32,          
        lora_dropout=0.1,
        bias="none",
        inference_mode=False,    
        task_type=TaskType.CAUSAL_LM,
        target_modules=[
            "q_proj", 
            "v_proj", 
            "k_proj", 
            "o_proj"
        ]
    )
    
    lora_model = get_peft_model(base_model, lora_config)
    
    for name, param in lora_model.named_parameters():
        if 'lora' in name:
            param.requires_grad = True
    
    lora_model.print_trainable_parameters()
    return lora_model

model_for_training = setup_lora_model()
print("LoRA model is ready.")

Already found a `peft_config` attribute in the model. This will lead to having multiple adapters in the model. Make sure to know what you are doing!


Setting up LoRA model...
trainable params: 16,777,216 || all params: 6,755,192,832 || trainable%: 0.2484
LoRA model is ready.


In [11]:
import gc
torch.cuda.empty_cache()
gc.collect()

eval_data_training = load_and_format_wmt(num_examples=1000)

model_for_training.train()

train_args = TrainingArguments(
    output_dir="./my_results",
    num_train_epochs=4,           
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,  
    learning_rate=1e-4,
    fp16=False,
    save_steps=1000,
    logging_steps=50,            
    weight_decay=0.05,
    warmup_ratio=0.15,
    max_grad_norm=1.0,
    remove_unused_columns=False,
    lr_scheduler_type="cosine",
    gradient_checkpointing=True,
    load_best_model_at_end=True,  
    evaluation_strategy="steps",   
    eval_steps=100,              
    save_total_limit=2,
    metric_for_best_model="loss",
    optim="adamw_torch",
    ddp_find_unused_parameters=False,
    report_to="tensorboard",     
    logging_dir="./logs",        
    logging_first_step=True      
)

print(f"Training arguments set. Will train for {train_args.num_train_epochs} epochs on ~{len(training_data)} examples.")

trainable_params = []
all_param_size = 0
trainable_param_size = 0

for name, param in model_for_training.named_parameters():
    all_param_size += param.numel()
    if param.requires_grad:
        trainable_params.append(name)
        trainable_param_size += param.numel()
print(f"trainable params: {len(trainable_params)} with total size: {trainable_param_size}")
print(f"all params: {all_param_size}, trainable%: {100 * trainable_param_size / all_param_size:.4f}%")

trainer = Trainer(
    model=model_for_training,
    args=train_args,
    train_dataset=training_data,
    eval_dataset=eval_data_training,
    data_collator=data_collator
)

print("Starting LoRA fine-tuning...")
train_output = trainer.train()
print("\nTraining finished.")

print("Training metrics:")
print(train_output)

trainer.save_model("./my_results/lora_7b")
print("Fine-tuning done. Model saved at ./my_results/lora_7b")

Loading saved WMT19 dataset from ./saved_datasets/wmt19_train_1000...
Training arguments set. Will train for 4 epochs on ~5000 examples.
trainable params: 256 with total size: 16777216
all params: 6755192832, trainable%: 0.2484%
Starting LoRA fine-tuning...




Step,Training Loss,Validation Loss



Training finished.
Training metrics:
TrainOutput(global_step=624, training_loss=1.8458855952589939, metrics={'train_runtime': 7908.0716, 'train_samples_per_second': 2.529, 'train_steps_per_second': 0.079, 'total_flos': 4.04869656801706e+17, 'train_loss': 1.8458855952589939, 'epoch': 3.9792})
Fine-tuning done. Model saved at ./my_results/lora_7b


In [None]:
#base_model_save_path = "./saved_models/base_llama"

In [None]:
# --------------------------------------------
# Cell 7: Verify and Evaluate Fine-tuned Model
# --------------------------------------------
from peft import PeftModel
import evaluate
from tqdm import tqdm
from sacrebleu import corpus_bleu
#import gc
#torch.cuda.empty_cache()
#gc.collect()

def load_lora_model(checkpoint_path="./my_results/lora_7b"):
    print(f"Loading LoRA model from {checkpoint_path}...")
    base = AutoModelForCausalLM.from_pretrained(
        base_model_save_path,
        device_map="auto",
        torch_dtype=torch.float16
    )
    print("Base model loaded. Now loading and merging LoRA weights...")
    
    lora_model_loaded = PeftModel.from_pretrained(base, checkpoint_path)
    print("LoRA weights loaded. Merging weights...")
    
    lora_model_loaded = lora_model_loaded.merge_and_unload()
    print("Weights merged successfully.")
    
    # Verify the models are different
    print("\nVerifying models are different:")
    print(f"Base Model ID: {id(base_model)}")
    print(f"LoRA Model ID: {id(lora_model_loaded)}")
    
    return lora_model_loaded

def compare_translations(model1, model2, tokenizer, text, name1="Base", name2="LoRA"):
    """Compare translations from two models"""
    prompt = build_prompt_for_translation(text)
    
    def get_translation(model, prompt):
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=256)
        translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
        if "English translation:" in translation:
            translation = translation.split("English translation:")[-1].strip()
        return translation
    
    trans1 = get_translation(model1, prompt)
    trans2 = get_translation(model2, prompt)
    
    print(f"\nGerman: {text}")
    print(f"{name1}: {trans1}")
    print(f"{name2}: {trans2}")
    print("Different?" if trans1 != trans2 else "Same")
    return trans1 != trans2

print("\nEvaluating LoRA-Fine-Tuned Model...")
merged_model = load_lora_model("./my_results/lora_7b")

test_sentences = [
    "Die Sonne scheint heute besonders hell.",
    "KÃ¼nstliche Intelligenz verÃ¤ndert die Welt.",
    "Der kleine Hund spielt im Garten.",
    "MÃ¼nchen ist eine wunderschÃ¶ne Stadt."
]

print("\nComparing translations between base and fine-tuned models:")
differences_found = 0
for text in test_sentences:
    if compare_translations(base_model, merged_model, tokenizer, text):
        differences_found += 1

print(f"\nFound differences in {differences_found}/{len(test_sentences)} translations")

print("\nRunning full evaluation...")
lora_debug_results = debug_evaluate_model(
    model=merged_model,
    tokenizer=tokenizer,
    eval_dataset=eval_dataset,
    num_examples=20,
    debug_print=3,
    description="LoRA Fine-Tuned"
)

print("\nFinal comparison:")
print(f"Baseline -> BLEU = {baseline_debug_results['bleu']:.2f}, COMET = {baseline_debug_results['comet']:.3f}")
print(f"LoRA     -> BLEU = {lora_debug_results['bleu']:.2f}, COMET = {lora_debug_results['comet']:.3f}")

lora_results_path = "./results/lora_results.json"
with open(lora_results_path, 'w') as f:
    json.dump(lora_debug_results, f)
print(f"LoRA results saved to {lora_results_path}")

We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Based on the current allocation process, no modules could be assigned to the following devices due to insufficient memory:
  - 0: 666910848 bytes required
These minimum requirements are specific to this allocation attempt and may vary. Consider increasing the available memory for these devices to at least the specified minimum, or adjusting the model config.



Evaluating LoRA-Fine-Tuned Model...
Loading LoRA model from ./my_results/lora_7b...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Base model loaded. Now loading and merging LoRA weights...


We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Based on the current allocation process, no modules could be assigned to the following devices due to insufficient memory:
  - 0: 669008128 bytes required
These minimum requirements are specific to this allocation attempt and may vary. Consider increasing the available memory for these devices to at least the specified minimum, or adjusting the model config.


LoRA weights loaded. Merging weights...
Weights merged successfully.

Verifying models are different:
Base Model ID: 140174717987088
LoRA Model ID: 140159898659792

Comparing translations between base and fine-tuned models:


