In [1]:
# --------------------------------------------
# Cell 1: Initial imports and environment setup
# --------------------------------------------
import torch
import numpy as np
import os
from tqdm import tqdm
from datasets import load_dataset
from sacrebleu import corpus_bleu
from transformers import AutoModelForCausalLM, AutoTokenizer
import evaluate
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments
)
print(f"NumPy version: {np.__version__}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU name: {torch.cuda.get_device_name(0)}")

# (Optional) Add your Hugging Face token if needed
os.environ["HUGGING_FACE_HUB_TOKEN"] = "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"

NumPy version: 1.25.2
PyTorch version: 2.5.1+cu118
CUDA available: True
GPU name: NVIDIA A100-PCIE-40GB


In [2]:
# --------------------------------------------
# Cell 2: Load Base Model & Tokenizer (Baseline)
# --------------------------------------------
model_name = "meta-llama/Llama-2-7b-hf"

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)

print("Loading base model in float16 with device_map='auto'...")
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16
)

# Ensure a pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Baseline model and tokenizer loaded.")


Loading tokenizer...
Loading base model in float16 with device_map='auto'...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Baseline model and tokenizer loaded.


In [3]:
# --------------------------------------------
# Cell 3: Prepare WMT19 Data (Prompt Masking)
# --------------------------------------------
def build_full_text(example):
    german = example["translation"]["de"]
    english = example["translation"]["en"]
    prompt = (
        "Translate this German text into fluent English.\n"
        f"{german}\n"
        "Translation:"
    )
    full_text = prompt + " " + english
    return {"full_text": full_text}

def load_and_format_wmt(num_examples=50000):
    print(f"Loading WMT19 (de-en) train data with {num_examples} examples...")
    dataset = load_dataset("wmt19", "de-en", split="train")
    dataset = dataset.shuffle(seed=42).select(range(num_examples))
    
    dataset = dataset.map(
        build_full_text,
        desc="Building prompt + target text",
        remove_columns=dataset.column_names
    )
    return dataset

class PromptMaskCollator:
    def __init__(self, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __call__(self, examples):
        texts = [ex["full_text"] for ex in examples]
        
        tokenized = self.tokenizer(
            texts,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt"
        )
        input_ids = tokenized["input_ids"]
        attention_mask = tokenized["attention_mask"]
        labels = input_ids.clone()
        
        for i, text in enumerate(texts):
            if "Translation:" in text:
                prompt_part, _ = text.split("Translation:", 1)
                prompt_part = prompt_part + "Translation:"
            else:
                prompt_part = text
            
            prompt_ids = self.tokenizer(
                prompt_part,
                truncation=True,
                max_length=self.max_length,
                add_special_tokens=True
            )["input_ids"]
            
            prompt_len = len(prompt_ids)
            if prompt_len > labels.size(1):
                prompt_len = labels.size(1)
            
            labels[i, :prompt_len] = -100
        
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

# Load training data for fine-tuning
training_data = load_and_format_wmt(num_examples=50000)
data_collator = PromptMaskCollator(tokenizer, max_length=512)

print("Training data prepared with prompt masking.")


Loading WMT19 (de-en) train data with 50000 examples...


Building prompt + target text:   0%|          | 0/50000 [00:00<?, ? examples/s]

Training data prepared with prompt masking.


In [4]:
# --------------------------------------------
# Cell 4: LoRA Configuration and Model
# --------------------------------------------
from peft import LoraConfig, get_peft_model, TaskType

def setup_lora_model():
    print("Setting up LoRA model...")
    torch.cuda.empty_cache()
    
    lora_base = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype=torch.float16
    )
    # Freeze embeddings
    for param in lora_base.get_input_embeddings().parameters():
        param.requires_grad = False
    
    lora_config = LoraConfig(
        r=8,
        lora_alpha=32,
        lora_dropout=0.1,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"]
    )
    
    lora_model = get_peft_model(lora_base, lora_config)
    lora_model.print_trainable_parameters()
    return lora_model

model_for_training = setup_lora_model()

print("LoRA model is ready.")


Setting up LoRA model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 8,388,608 || all params: 6,746,804,224 || trainable%: 0.1243
LoRA model is ready.


In [5]:
# --------------------------------------------
# Cell 5: Training Arguments
# --------------------------------------------
train_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,  # 1 epoch over 50k lines
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,
    learning_rate=1e-5,
    fp16=True,
    save_steps=500,
    logging_steps=100,
    weight_decay=0.01,
    warmup_steps=100,
    # Crucial to avoid the "no columns match" error:
    remove_unused_columns=False
)

print("Training arguments set.")


Training arguments set.


In [6]:
# --------------------------------------------
# Cell 6: Initialize Trainer & Start Fine-tuning
# --------------------------------------------
from transformers import Trainer

trainer = Trainer(
    model=model_for_training,
    args=train_args,
    train_dataset=training_data,
    data_collator=data_collator
)

print("Starting LoRA fine-tuning...\n")
train_output = trainer.train()
print("\nTraining finished.")

print("Training metrics:")
print(train_output)

print("Fine-tuning done.")


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Starting LoRA fine-tuning...



Step,Training Loss
100,29.7164
200,1.663
300,0.0674
400,0.0615
500,0.0622
600,0.0636
700,0.0631
800,0.0598
900,0.0605
1000,0.0604



Training finished.
Training metrics:
TrainOutput(global_step=1562, training_loss=2.0626527741074407, metrics={'train_runtime': 7031.6059, 'train_samples_per_second': 7.111, 'train_steps_per_second': 0.222, 'total_flos': 1.015851293136126e+18, 'train_loss': 2.0626527741074407, 'epoch': 0.99968})
Fine-tuning done.


In [7]:
# --------------------------------------------
# Cell 7: Load Evaluation Data
# --------------------------------------------
def load_eval_data(num_examples=100):
    print(f"Loading WMT19 (de-en) validation data with {num_examples} examples...")
    eval_data = load_dataset("wmt19", "de-en", split="validation")
    eval_data = eval_data.select(range(min(num_examples, len(eval_data))))
    return eval_data

eval_dataset = load_eval_data(num_examples=100)

print("Validation data loaded.")


Loading WMT19 (de-en) validation data with 100 examples...
Validation data loaded.


In [8]:
# --------------------------------------------
# Cell 8: Universal Evaluation Function & Baseline Eval
# --------------------------------------------
def evaluate_model(
    model, 
    tokenizer, 
    eval_dataset, 
    num_examples=20, 
    num_to_show=5, 
    description="Model"
):
    """
    Evaluate any given `model` on `eval_dataset`.
    Returns BLEU & COMET. Prints a few translations.
    """
    print(f"Evaluating {description} on {num_examples} examples. Showing {num_to_show}...\n")
    
    subset = eval_dataset.select(range(num_examples))
    translations = []
    references = []
    
    for i, ex in enumerate(tqdm(subset, desc=f"Evaluating {description}")):
        src_de = ex["translation"]["de"]
        ref_en = ex["translation"]["en"]
        
        prompt = (
            "Translate this German text into fluent English.\n"
            f"{src_de}\n"
            "Translation:"
        )
        
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,
                num_beams=4,
                do_sample=False,
                early_stopping=True,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id
            )
        
        full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        if "Translation:" in full_output:
            pred_en = full_output.split("Translation:")[-1].strip()
        else:
            pred_en = full_output
        
        translations.append(pred_en)
        references.append([ref_en])
        
        if i < num_to_show:
            print(f"Example {i}")
            print("  Source (DE):", src_de)
            print("  Predicted (EN):", pred_en)
            print("  Reference (EN):", ref_en)
            print("-"*80)
    
    # Calculate BLEU
    bleu = corpus_bleu(translations, references)
    print(f"\n[{description}] BLEU score: {bleu.score}")

    # Calculate COMET
    comet_metric = evaluate.load("comet")
    comet_results = comet_metric.compute(
        predictions=translations,
        references=[r[0] for r in references],
        sources=[ex["translation"]["de"] for ex in subset]
    )
    print(f"[{description}] COMET score: {comet_results['mean_score']}")
    
    return {
        "translations": translations,
        "references": references,
        "bleu": bleu.score,
        "comet": comet_results["mean_score"]
    }

# ---- Evaluate Baseline Model ----
print("Evaluating the *baseline* model...\n")
baseline_results = evaluate_model(
    model=base_model,
    tokenizer=tokenizer,
    eval_dataset=eval_dataset, 
    num_examples=20,   # evaluate on 20 for quick test
    num_to_show=3,
    description="Baseline"
)

print("Baseline evaluation done.")


Evaluating the *baseline* model...

Evaluating Baseline on 20 examples. Showing 3...



Evaluating Baseline:   5%|███▏                                                           | 1/20 [00:06<01:54,  6.03s/it]

Example 0
  Source (DE): München 1856: Vier Karten, die Ihren Blick auf die Stadt verändern
  Predicted (EN): Munich 1856: Four cards that will change the way you look at the city
München 1856: Vier Karten, die Ihren Blick auf die Stadt verändern.
Munich 1856: Four cards that will change the way you look at the city.
München 1856: Vier Karten, die Ihren Blick auf die Stadt verändern. München 1856: Vier Karten, die Ihren Blick auf die Stadt verändern. München 1856: Vier Karten, die Ihren Blick auf die Stadt verändern. München 1856: Vier Karten, die Ihren Blick auf die Stadt verändern. München 1856: Vier Karten, die Ihren Blick auf die Stadt verändern. München 1856: Vier Karten, die Ihren Blick auf die Stadt verändern. München 1856: Vier Karten, die Ihren Blick auf die Stadt verändern. München 1856: Vier Karten, die Ihren Blick auf die Stadt verändern.
  Reference (EN): Munich 1856: Four maps that will change your view of the city
---------------------------------------------------------

Evaluating Baseline:  10%|██████▎                                                        | 2/20 [00:07<01:03,  3.52s/it]

Example 1
  Source (DE): Eine Irren-Anstalt, wo sich heute Jugendliche begegnen sollen.
  Predicted (EN): An insane asylum, where today's youth should meet.
  Reference (EN): A mental asylum, where today young people are said to meet.
--------------------------------------------------------------------------------


Evaluating Baseline:  15%|█████████▍                                                     | 3/20 [00:09<00:43,  2.53s/it]

Example 2
  Source (DE): Eine Gruftkapelle, wo nun für den S-Bahn-Tunnel gegraben wird.
  Predicted (EN): A graveyard chapel, where now the S-Bahn tunnel is being dug.
  Reference (EN): A crypt chapel, where they are now digging tunnels for the S-Bahn.
--------------------------------------------------------------------------------


Evaluating Baseline: 100%|██████████████████████████████████████████████████████████████| 20/20 [01:05<00:00,  3.27s/it]



[Baseline] BLEU score: 2.689745497366639


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`
Encoder model frozen.
/opt/conda/lib/python3.11/site-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA A100-PCIE-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_flo

[Baseline] COMET score: 0.649969045817852
Baseline evaluation done.


In [9]:
# --------------------------------------------
# Cell 9: Evaluate Fine-Tuned Model & Compare
# --------------------------------------------
print("Evaluating the *fine-tuned* model...\n")
finetuned_results = evaluate_model(
    model=model_for_training,
    tokenizer=tokenizer,
    eval_dataset=eval_dataset,
    num_examples=20,
    num_to_show=3,
    description="Fine-tuned"
)

print("\nComparison:")
print(f"Baseline BLEU: {baseline_results['bleu']:.2f} | COMET: {baseline_results['comet']:.3f}")
print(f"Fine-tuned BLEU: {finetuned_results['bleu']:.2f} | COMET: {finetuned_results['comet']:.3f}")

print("Fine-tuned model evaluation & comparison finished.")


Evaluating the *fine-tuned* model...

Evaluating Fine-tuned on 20 examples. Showing 3...



Evaluating Fine-tuned:   5%|███                                                          | 1/20 [00:00<00:14,  1.27it/s]

Example 0
  Source (DE): München 1856: Vier Karten, die Ihren Blick auf die Stadt verändern
  Predicted (EN): Munich 1856: Four cards that change your view of the city
  Reference (EN): Munich 1856: Four maps that will change your view of the city
--------------------------------------------------------------------------------


Evaluating Fine-tuned:  10%|██████                                                       | 2/20 [00:01<00:13,  1.36it/s]

Example 1
  Source (DE): Eine Irren-Anstalt, wo sich heute Jugendliche begegnen sollen.
  Predicted (EN): A lunatic asylum, where today's youth should meet.
  Reference (EN): A mental asylum, where today young people are said to meet.
--------------------------------------------------------------------------------


Evaluating Fine-tuned:  15%|█████████▏                                                   | 3/20 [00:02<00:13,  1.27it/s]

Example 2
  Source (DE): Eine Gruftkapelle, wo nun für den S-Bahn-Tunnel gegraben wird.
  Predicted (EN): A crypt chapel, where now the S-Bahn tunnel is being dug.
  Reference (EN): A crypt chapel, where they are now digging tunnels for the S-Bahn.
--------------------------------------------------------------------------------


Evaluating Fine-tuned: 100%|████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.04s/it]



[Fine-tuned] BLEU score: 64.93358309501976


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`
Encoder model frozen.
/opt/conda/lib/python3.11/site-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tok

[Fine-tuned] COMET score: 0.8147880345582962

Comparison:
Baseline BLEU: 2.69 | COMET: 0.650
Fine-tuned BLEU: 64.93 | COMET: 0.815

[Cell 9 complete] Fine-tuned model evaluation & comparison finished.
