In [1]:
# --------------------------------------------
# Cell 1: Initial imports and environment setup
# --------------------------------------------
import torch
import numpy as np
import os
import json
from tqdm import tqdm
from datasets import load_dataset
from sacrebleu import corpus_bleu
from transformers import AutoModelForCausalLM, AutoTokenizer
import evaluate
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments
)

print(f"NumPy version: {np.__version__}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU name: {torch.cuda.get_device_name(0)}")

# Create necessary directories
os.makedirs("./saved_models", exist_ok=True)
os.makedirs("./results", exist_ok=True)
os.makedirs("./my_results", exist_ok=True)

os.environ["HUGGING_FACE_HUB_TOKEN"] = "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"

NumPy version: 1.25.2
PyTorch version: 2.5.1+cu118
CUDA available: True
GPU name: NVIDIA A100-PCIE-40GB


In [2]:
# --------------------------------------------
# Cell 2: Evaluation Function Definition (Need this early)
# --------------------------------------------
def build_prompt_for_translation(german_text: str) -> str:
    prompt = (
        "Translate this German text into fluent, natural English:\n"
        f"German: {german_text}\n"
        "English:"
    )
    return prompt

def debug_evaluate_model(
    model,
    tokenizer,
    eval_dataset,
    num_examples=20,
    debug_print=3,
    description="Model"
):
    comet_metric = evaluate.load("comet")
    predictions = []
    references = []
    sources = []

    subset = eval_dataset.select(range(min(num_examples, len(eval_dataset))))
    print(f"\n[DEBUG EVAL] {description} on {num_examples} examples...\n")

    for i, ex in enumerate(tqdm(subset, desc=f"Evaluating {description}")):
        src_de = ex["translation"]["de"]
        ref_en = ex["translation"]["en"]

        prompt_text = build_prompt_for_translation(src_de)

        tokenized_input = tokenizer(
            prompt_text,
            return_tensors="pt",
            add_special_tokens=True
        ).to(model.device)

        with torch.no_grad():
            output_ids = model.generate(
                **tokenized_input,
                max_new_tokens=256,
                num_beams=4,
                do_sample=False,
                early_stopping=True,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

        full_output_text = tokenizer.decode(output_ids[0], skip_special_tokens=False)

        if "English:" in full_output_text:
            pred_en = full_output_text.rsplit("English:", 1)[-1].strip()
        else:
            pred_en = full_output_text

        predictions.append(pred_en)
        references.append([ref_en])
        sources.append(src_de)

        if i < debug_print:
            print("\n==========================================")
            print(f"Example {i}")
            print("---------------[ PROMPT ]-----------------")
            print(prompt_text)
            print("--------------[ TOKENIZED ]---------------")
            print(f"Input IDs: {tokenized_input['input_ids'][0].tolist()}")
            print("-----------[ FULL MODEL OUTPUT ]----------")
            print(repr(full_output_text))
            print("-------------[ EXTRACTED EN ]-------------")
            print(repr(pred_en))
            print("--------------[ REFERENCE ]---------------")
            print(ref_en)
            print("==========================================\n")

    bleu = corpus_bleu(predictions, references)
    print(f"[{description}] BLEU = {bleu.score:.2f}")

    comet_results = comet_metric.compute(
        predictions=predictions,
        references=[r[0] for r in references],
        sources=sources
    )
    print(f"[{description}] COMET = {comet_results['mean_score']:.3f}\n")

    return {
        "predictions": predictions,
        "references": references,
        "bleu": bleu.score,
        "comet": comet_results["mean_score"]
    }

In [3]:
# --------------------------------------------
# Cell 3: Load/Save Base Model & Get Baseline Scores
# --------------------------------------------
model_name = "meta-llama/Llama-2-7b-hf"
base_model_save_path = "./saved_models/base_llama"

def load_or_download_base_model():
    if os.path.exists(base_model_save_path):
        print("Loading saved base model...")
        tokenizer = AutoTokenizer.from_pretrained(base_model_save_path)
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_save_path,
            device_map="auto",
            torch_dtype=torch.float16
        )
    else:
        print("Downloading base model...")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        base_model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            torch_dtype=torch.float16
        )
        # Save the model and tokenizer
        print("Saving base model...")
        tokenizer.save_pretrained(base_model_save_path)
        base_model.save_pretrained(base_model_save_path)
        print(f"Base model saved to {base_model_save_path}")

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    return tokenizer, base_model

def load_eval_data(num_examples=50):
    print(f"Loading WMT19 (de-en) validation data with {num_examples} examples...")
    eval_data = load_dataset("wmt19", "de-en", split="validation")
    eval_data = eval_data.select(range(min(num_examples, len(eval_data))))
    return eval_data

# Load/download the model
tokenizer, base_model = load_or_download_base_model()
print("Base model and tokenizer ready.")

# Get baseline scores immediately
eval_dataset = load_eval_data(num_examples=50)
baseline_results_path = "./results/baseline_results.json"

if os.path.exists(baseline_results_path):
    print("Loading saved baseline results...")
    with open(baseline_results_path, 'r') as f:
        baseline_debug_results = json.load(f)
    print(f"Baseline -> BLEU = {baseline_debug_results['bleu']:.2f}, COMET = {baseline_debug_results['comet']:.3f}")
else:
    print("Evaluating baseline model...")
    baseline_debug_results = debug_evaluate_model(
        model=base_model,
        tokenizer=tokenizer,
        eval_dataset=eval_dataset,
        num_examples=20,
        debug_print=3,
        description="Baseline LLaMA"
    )
    # Save results
    with open(baseline_results_path, 'w') as f:
        json.dump(baseline_debug_results, f)

print("Baseline evaluation complete.")

Loading saved base model...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Base model and tokenizer ready.
Loading WMT19 (de-en) validation data with 50 examples...
Loading saved baseline results...
Baseline -> BLEU = 48.89, COMET = 0.542
Baseline evaluation complete.


In [4]:
# --------------------------------------------
# Cell 4: Prepare Training Data
# --------------------------------------------
def build_full_text(example):
    german = example["translation"]["de"]
    english = example["translation"]["en"]
    prompt = (
        "Translate this German text into fluent English.\n"
        f"{german}\n"
        "Translation:"
    )
    full_text = prompt + " " + english
    return {"full_text": full_text}

def load_and_format_wmt(num_examples=10000):
    print(f"Loading WMT19 (de-en) train data with {num_examples} examples...")
    dataset = load_dataset("wmt19", "de-en", split="train")
    dataset = dataset.shuffle(seed=42).select(range(num_examples))
    
    dataset = dataset.map(
        build_full_text,
        desc="Building prompt + target text",
        remove_columns=dataset.column_names
    )
    return dataset

class PromptMaskCollator:
    def __init__(self, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __call__(self, examples):
        texts = [ex["full_text"] for ex in examples]
        
        tokenized = self.tokenizer(
            texts,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt"
        )
        input_ids = tokenized["input_ids"]
        attention_mask = tokenized["attention_mask"]
        labels = input_ids.clone()
        
        for i, text in enumerate(texts):
            if "Translation:" in text:
                prompt_part, _ = text.split("Translation:", 1)
                prompt_part = prompt_part + "Translation:"
            else:
                prompt_part = text
            
            prompt_ids = self.tokenizer(
                prompt_part,
                truncation=True,
                max_length=self.max_length,
                add_special_tokens=True
            )["input_ids"]
            
            prompt_len = len(prompt_ids)
            if prompt_len > labels.size(1):
                prompt_len = labels.size(1)
            
            labels[i, :prompt_len] = -100
        
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

# Load training data
training_data = load_and_format_wmt(num_examples=10000)
data_collator = PromptMaskCollator(tokenizer, max_length=512)

print("Training data prepared with prompt masking.")

Loading WMT19 (de-en) train data with 10000 examples...
Training data prepared with prompt masking.


In [5]:
# --------------------------------------------
# Cell 5: LoRA Configuration and Model Setup
# --------------------------------------------
from peft import LoraConfig, get_peft_model, TaskType

def setup_lora_model():
    print("Setting up LoRA model...")
    torch.cuda.empty_cache()
    
    lora_config = LoraConfig(
        r=16,
        lora_alpha=64,
        lora_dropout=0.1,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"]
    )
    
    lora_model = get_peft_model(base_model, lora_config)  # Use existing base_model
    lora_model.print_trainable_parameters()
    return lora_model

model_for_training = setup_lora_model()
print("LoRA model is ready.")

Setting up LoRA model...
trainable params: 16,777,216 || all params: 6,755,192,832 || trainable%: 0.2484
LoRA model is ready.


In [6]:
# --------------------------------------------
# Cell 6: Training Setup and Execution
# --------------------------------------------
train_args = TrainingArguments(
    output_dir="./my_results",
    num_train_epochs=2,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    fp16=True,
    save_steps=500,
    logging_steps=100,
    weight_decay=0.05,
    warmup_ratio=0.1,
    max_grad_norm=1.0,
    remove_unused_columns=False,
    lr_scheduler_type="cosine",
    ddp_find_unused_parameters=False  # Added this
)

print(f"Training arguments set. Will train for {train_args.num_train_epochs} epochs on ~{len(training_data)} examples.")

trainer = Trainer(
    model=model_for_training,
    args=train_args,
    train_dataset=training_data,
    data_collator=data_collator
)

print("Starting LoRA fine-tuning...")
train_output = trainer.train()
print("\nTraining finished.")

print("Training metrics:")
print(train_output)

# Save the model
trainer.save_model("./my_results/lora_7b")
print("Fine-tuning done. Model saved at ./my_results/lora_7b")

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Training arguments set. Will train for 2 epochs on ~10000 examples.
Starting LoRA fine-tuning...


Step,Training Loss
100,5.5582
200,0.0595
300,0.0585
400,0.0532
500,0.0561
600,0.0531



Training finished.
Training metrics:
TrainOutput(global_step=624, training_loss=0.9375207231212885, metrics={'train_runtime': 2539.4354, 'train_samples_per_second': 7.876, 'train_steps_per_second': 0.246, 'total_flos': 4.060092175566766e+17, 'train_loss': 0.9375207231212885, 'epoch': 1.9952})
Fine-tuning done. Model saved at ./my_results/lora_7b


In [7]:
# --------------------------------------------
# Cell 7: Evaluate Fine-tuned Model
# --------------------------------------------
from peft import PeftModel

def load_lora_model(checkpoint_path="./my_results/lora_7b"):
    print(f"Loading LoRA model from {checkpoint_path}...")
    # Load saved base model
    base = AutoModelForCausalLM.from_pretrained(
        base_model_save_path,
        device_map="auto",
        torch_dtype=torch.float16
    )
    # Merge LoRA
    lora_model_loaded = PeftModel.from_pretrained(base, checkpoint_path)
    lora_model_loaded = lora_model_loaded.merge_and_unload()
    return lora_model_loaded

print("\nEvaluating LoRA-Fine-Tuned Model...")
merged_model = load_lora_model("./my_results/lora_7b")
lora_debug_results = debug_evaluate_model(
    model=merged_model,
    tokenizer=tokenizer,
    eval_dataset=eval_dataset,
    num_examples=20,
    debug_print=3,
    description="LoRA Fine-Tuned"
)

print("\nFinal comparison:")
print(f"Baseline -> BLEU = {baseline_debug_results['bleu']:.2f}, COMET = {baseline_debug_results['comet']:.3f}")
print(f"LoRA     -> BLEU = {lora_debug_results['bleu']:.2f}, COMET = {lora_debug_results['comet']:.3f}")

# Save LoRA results
lora_results_path = "./results/lora_results.json"
with open(lora_results_path, 'w') as f:
    json.dump(lora_debug_results, f)
print(f"LoRA results saved to {lora_results_path}")


Evaluating LoRA-Fine-Tuned Model...
Loading LoRA model from ./my_results/lora_7b...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/f49d328952c3470eff6bb6f545d62bfdb6e66304/checkpoints/model.ckpt`
Encoder model frozen.
/opt/conda/lib/python3.11/site-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']



[DEBUG EVAL] LoRA Fine-Tuned on 20 examples...



Evaluating LoRA Fine-Tuned:   5%|██▋                                                   | 1/20 [00:00<00:13,  1.45it/s]


Example 0
---------------[ PROMPT ]-----------------
Translate this German text into fluent, natural English:
German: München 1856: Vier Karten, die Ihren Blick auf die Stadt verändern
English:
--------------[ TOKENIZED ]---------------
Input IDs: [1, 4103, 9632, 445, 5332, 1426, 964, 1652, 8122, 29892, 5613, 4223, 29901, 13, 29954, 3504, 29901, 10864, 29871, 29896, 29947, 29945, 29953, 29901, 23650, 476, 8109, 29892, 762, 306, 13608, 350, 1406, 1622, 762, 5587, 1147, 3140, 824, 13, 24636, 29901]
-----------[ FULL MODEL OUTPUT ]----------
'<s> Translate this German text into fluent, natural English:\nGerman: München 1856: Vier Karten, die Ihren Blick auf die Stadt verändern\nEnglish: Munich 1856: Four cards that will change your view of the city</s>'
-------------[ EXTRACTED EN ]-------------
'Munich 1856: Four cards that will change your view of the city</s>'
--------------[ REFERENCE ]---------------
Munich 1856: Four maps that will change your view of the city



Evaluating LoRA Fine-Tuned:  10%|█████▍                                                | 2/20 [00:01<00:09,  1.87it/s]


Example 1
---------------[ PROMPT ]-----------------
Translate this German text into fluent, natural English:
German: Eine Irren-Anstalt, wo sich heute Jugendliche begegnen sollen.
English:
--------------[ TOKENIZED ]---------------
Input IDs: [1, 4103, 9632, 445, 5332, 1426, 964, 1652, 8122, 29892, 5613, 4223, 29901, 13, 29954, 3504, 29901, 11281, 6600, 1267, 29899, 2744, 303, 1997, 29892, 8879, 2160, 12843, 19472, 4545, 1812, 387, 4566, 899, 2435, 29889, 13, 24636, 29901]
-----------[ FULL MODEL OUTPUT ]----------
"<s> Translate this German text into fluent, natural English:\nGerman: Eine Irren-Anstalt, wo sich heute Jugendliche begegnen sollen.\nEnglish: A lunatic asylum, where today's youth should meet.</s>"
-------------[ EXTRACTED EN ]-------------
"A lunatic asylum, where today's youth should meet.</s>"
--------------[ REFERENCE ]---------------
A mental asylum, where today young people are said to meet.



Evaluating LoRA Fine-Tuned:  15%|████████                                              | 3/20 [00:01<00:08,  1.89it/s]


Example 2
---------------[ PROMPT ]-----------------
Translate this German text into fluent, natural English:
German: Eine Gruftkapelle, wo nun für den S-Bahn-Tunnel gegraben wird.
English:
--------------[ TOKENIZED ]---------------
Input IDs: [1, 4103, 9632, 445, 5332, 1426, 964, 1652, 8122, 29892, 5613, 4223, 29901, 13, 29954, 3504, 29901, 11281, 5430, 615, 21474, 1808, 29892, 8879, 11923, 1865, 972, 317, 29899, 29933, 5422, 29899, 29911, 16163, 21598, 336, 1785, 4296, 29889, 13, 24636, 29901]
-----------[ FULL MODEL OUTPUT ]----------
'<s> Translate this German text into fluent, natural English:\nGerman: Eine Gruftkapelle, wo nun für den S-Bahn-Tunnel gegraben wird.\nEnglish: A graveyard chapel, where now the S-Bahn tunnel is being dug.</s>'
-------------[ EXTRACTED EN ]-------------
'A graveyard chapel, where now the S-Bahn tunnel is being dug.</s>'
--------------[ REFERENCE ]---------------
A crypt chapel, where they are now digging tunnels for the S-Bahn.



Evaluating LoRA Fine-Tuned: 100%|█████████████████████████████████████████████████████| 20/20 [00:12<00:00,  1.64it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA A100-PCIE-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[LoRA Fine-Tuned] BLEU = 57.49


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[LoRA Fine-Tuned] COMET = 0.792


Final comparison:
Baseline -> BLEU = 48.89, COMET = 0.542
LoRA     -> BLEU = 57.49, COMET = 0.792
LoRA results saved to ./results/lora_results.json
