In [1]:
# Cell 1: Initial imports and setup
import torch
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from sacrebleu import corpus_bleu
from comet import download_model, load_from_checkpoint
import os
import evaluate
from tqdm import tqdm

print(f"NumPy version: {np.__version__}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU name: {torch.cuda.get_device_name(0)}")

# Add HF token
os.environ["HUGGING_FACE_HUB_TOKEN"] = "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"

NumPy version: 1.25.2
PyTorch version: 2.5.1+cu118
CUDA available: True
GPU name: NVIDIA A100-PCIE-40GB


In [2]:
# Cell 1.5 (New cell to add between 1 and 2)
def print_gpu_memory():
    if torch.cuda.is_available():
        print("Current GPU Memory Usage:")
        print(f"Allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
        print(f"Cached: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")

# Print initial memory state
print_gpu_memory()

Current GPU Memory Usage:
Allocated: 0.00 GB
Cached: 0.00 GB


In [3]:
# Cell 2: Load Base Model and Tokenizer
model_name = "meta-llama/Llama-2-7b-hf"

print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Clear CUDA cache before loading model
torch.cuda.empty_cache()

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    load_in_8bit=True  # Add 8-bit quantization
)

# Set padding token
tokenizer.pad_token = tokenizer.eos_token

print_gpu_memory()

Loading model and tokenizer...


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Current GPU Memory Usage:
Allocated: 6.62 GB
Cached: 6.62 GB


In [7]:
# Cell 3: Load Validation Data
def load_eval_data(num_examples=20):  # Reduced from 100 to 20
    print("Loading WMT validation data...")
    eval_data = load_dataset("wmt19", "de-en", split="validation")
    
    # Take subset for testing
    eval_data = eval_data.select(range(min(num_examples, len(eval_data))))
    return eval_data

eval_dataset = load_eval_data()
print(f"Loaded {len(eval_dataset)} examples for evaluation")

Loading WMT validation data...
Loaded 20 examples for evaluation


In [10]:
# Cell 4: Translation Function
def translate(text, src_lang="German", tgt_lang="English", max_length=128):
    prompt = f"Translate from {src_lang} to {tgt_lang}:\n{text}\nTranslation:"
    
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_length,
            num_beams=4,
            do_sample=False,
            early_stopping=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.2
        )
    
    translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
    translation = translation.split("Translation:")[-1].strip()
    translation = translation.split('\n')[0]
    
    return translation

In [12]:
# Cell 5: Initial Evaluation
def evaluate_translations(num_examples=20, num_to_show=10, batch_size=2):
    print(f"Starting translation evaluation on {num_examples} examples...")
    print(f"Showing first {num_to_show} translations...")
    
    # Take subset of evaluation data
    eval_subset = eval_dataset.select(range(num_examples))
    
    translations = []
    references = []
    sources = []  # Add a list to store source texts
    
    # Process in smaller batches
    for i in tqdm(range(0, len(eval_subset), batch_size), desc="Translating"):
        # Get batch indices
        end_idx = min(i + batch_size, len(eval_subset))
        batch = eval_subset.select(range(i, end_idx))
        
        for example in batch:
            source_text = example['translation']['de']
            reference = example['translation']['en']
            
            # Clear cache before each translation
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            
            translation = translate(source_text)
            
            translations.append(translation)
            references.append([reference])
            sources.append(source_text)  # Store source text
            
            if len(translations) <= num_to_show:
                print(f"\nExample {len(translations)-1}:")
                print("Source:", source_text)
                print("Translation:", translation)
                print("Reference:", reference)
    
    # Calculate BLEU score
    bleu = corpus_bleu(translations, references)
    print(f"\nBLEU score: {bleu.score}")
    
    # Calculate COMET score
    comet = evaluate.load('comet')
    comet_scores = comet.compute(
        predictions=translations,
        references=[ref[0] for ref in references],
        sources=sources  # Use stored source texts
    )
    print(f"COMET score: {comet_scores['mean_score']}")
    
    return {
        'translations': translations,
        'references': references,
        'bleu': bleu.score,
        'comet': comet_scores['mean_score']
    }

# Clear cache before evaluation
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Get baseline results
baseline_results = evaluate_translations()

Starting translation evaluation on 20 examples...
Showing first 10 translations...


Translating:   0%|                                                                               | 0/10 [00:00<?, ?it/s]


Example 0:
Source: München 1856: Vier Karten, die Ihren Blick auf die Stadt verändern
Translation: Munich 1856: Four Cards That Will Change Your View of the City
Reference: Munich 1856: Four maps that will change your view of the city


Translating:  10%|███████                                                                | 1/10 [00:19<02:59, 19.92s/it]


Example 1:
Source: Eine Irren-Anstalt, wo sich heute Jugendliche begegnen sollen.
Translation: An insane asylum, where today's youth should meet.
Reference: A mental asylum, where today young people are said to meet.

Example 2:
Source: Eine Gruftkapelle, wo nun für den S-Bahn-Tunnel gegraben wird.
Translation: A graveyard chapel, where now the S-Bahn tunnel is being dug.
Reference: A crypt chapel, where they are now digging tunnels for the S-Bahn.


Translating:  20%|██████████████▏                                                        | 2/10 [00:39<02:39, 19.93s/it]


Example 3:
Source: Kleingärtner bewirtschaften den einstigen Grund von Bauern.
Translation: Small gardeners cultivate the former land of farmers.
Reference: Allotment holders cultivate the soil of former farmers.

Example 4:
Source: Die älteste offizielle Karte Münchens fördert spannende Geschichten zu Tage.
Translation: The oldest official map of Munich brings exciting stories to light.
Reference: The oldest official map of Munich brings captivating stories to light.


Translating:  30%|█████████████████████▎                                                 | 3/10 [00:54<02:02, 17.56s/it]


Example 5:
Source: Es nervt, wenn Landkarten nicht aktuell sind.
Translation: It bothers me when maps are not up-to-date.
Reference: It is annoying when geographical maps are not up-to-date.

Example 6:
Source: Das kennt jeder, der sich schon mal aufregen musste, weil das Auto-Navi statt einer Umgehungsstraße eine grüne Wiese anzeigte.
Translation: Everyone who has ever gotten upset because the car navigation system instead of a detour showed a green meadow knows that.
Reference: Anyone who has ever got worked up because the car's sat-nav is showing a green field instead of a bypass knows that.


Translating:  40%|████████████████████████████▍                                          | 4/10 [01:14<01:50, 18.47s/it]


Example 7:
Source: Die historischen Landkarten des digitalen Bayern-Atlases, ein Angebot des Geoportals Bayern der Staatsregierung, sind alles andere als aktuell - doch gerade deshalb sehr aufschlussreich.
Translation: The historical maps of the digital Bavarian Atlas, an offer of the Geoportal Bavaria of the Bavarian State Government, are anything but up-to-date - but precisely for this reason very informative.
Reference: The historical maps of the digital BayernAtlas, an offering from the State Government's Geoportal Bayern, are anything but up-to-date – and yet it is precisely for this reason that they are so informative.

Example 8:
Source: Besonders wenn man sie mit aktuellen Online-Karten vergleicht.
Translation: Particularly if you compare them with current online maps.
Reference: Especially when one compares them with current online maps.


Translating:  50%|███████████████████████████████████▌                                   | 5/10 [01:34<01:34, 18.97s/it]


Example 9:
Source: Dann wird deutlich, wie sich Städte und Gemeinden im Verbreitungsgebiet des Münchner Merkur seit dem 19. Jahrhundert verändert haben.
Translation: Dann wird deutlich, wie sich Städte und Gemeinden im Verbreitungsgebiet des Münchner Merkur seit dem 19. Jahrhundert verändert haben.
Reference: Then it becomes clear how the towns and municipalities in the distribution area of Munich's Merkur newspaper have changed since the 19th century.


Translating: 100%|██████████████████████████████████████████████████████████████████████| 10/10 [03:10<00:00, 19.08s/it]



BLEU score: 22.997519112894437


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`
Encoder model frozen.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA A100-PCIE-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to 

COMET score: 0.7877008706331253


In [13]:
# Cell 6: Prepare WMT Training Data
from transformers import Trainer, TrainingArguments
from datasets import load_dataset

def prepare_wmt_training_data(num_examples=5000):
    print(f"Loading WMT training data (first {num_examples} examples)...")
    
    train_data = load_dataset("wmt19", "de-en", split="train")
    train_data = train_data.shuffle(seed=42).select(range(num_examples))
    
    def format_for_training(examples):
        formatted_inputs = []
        formatted_outputs = []
        
        for item in examples['translation']:
            source = item['de']
            target = item['en']
            
            # Format input with prompt
            input_text = f"Translate from German to English:\n{source}\nTranslation: {target}</s>"
            formatted_inputs.append(input_text)
            formatted_outputs.append(target)
        
        return {
            'inputs': formatted_inputs,
            'outputs': formatted_outputs
        }
    
    formatted_data = train_data.map(
        format_for_training, 
        batched=True, 
        remove_columns=train_data.column_names
    )
    
    # Print some examples to verify format
    print("\nVerifying training data format:")
    for i in range(3):
        print(f"\nExample {i}:")
        print("Input:", formatted_data[i]['inputs'])
        print("Output:", formatted_data[i]['outputs'])
    
    print(f"\nPrepared {len(formatted_data)} examples for training")
    return formatted_data

# Prepare training data
training_data = prepare_wmt_training_data()

Loading WMT training data (first 5000 examples)...


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]


Verifying training data format:

Example 0:
Input: Translate from German to English:
2 Personen *** 3 Personen
Translation: 2 persons *** 3 persons</s>
Output: 2 persons *** 3 persons

Example 1:
Input: Translate from German to English:
Hintergrund :
Translation: Background :</s>
Output: Background :

Example 2:
Input: Translate from German to English:
Für ELA-Tonsäulen mit M6-Gewindebuchse wie z.B. ETS-215TW/WS, ETS-210TW/WS oder ETS-215/WS
Translation: For PA column speakers with M6 threaded bushing, e.g. ETS-215TW/WS, ETS-210TW/WS or ETS-215/WS</s>
Output: For PA column speakers with M6 threaded bushing, e.g. ETS-215TW/WS, ETS-210TW/WS or ETS-215/WS

Prepared 5000 examples for training


In [14]:
# Cell 7: Setup LoRA Configuration and Model for Training
from peft import LoraConfig, get_peft_model, TaskType

def setup_model_for_training():
    print("Setting up model with LoRA configuration...")
    
    # Clear CUDA cache
    torch.cuda.empty_cache()
    
    # LoRA config
    lora_config = LoraConfig(
        r=8,  # Rank
        lora_alpha=32,
        lora_dropout=0.1,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"]
    )
    
    # Load model
    model_for_training = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype=torch.float16,
    )
    
    # Freeze embedding layer
    for param in model_for_training.get_input_embeddings().parameters():
        param.requires_grad = False
    
    # Add LoRA adapters
    model_for_training = get_peft_model(model_for_training, lora_config)
    
    # Print trainable parameters info
    print("\nModel configuration:")
    model_for_training.print_trainable_parameters()
    
    return model_for_training

# Setup model
model_for_training = setup_model_for_training()

We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Setting up model with LoRA configuration...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.



Model configuration:
trainable params: 8,388,608 || all params: 6,746,804,224 || trainable%: 0.1243


In [15]:
# Cell 8: Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,
    learning_rate=1e-4,
    fp16=True,
    save_steps=100,
    logging_steps=20,
    weight_decay=0.01,
    max_steps=500,
    warmup_steps=100,
)

In [16]:
# Cell 9: Prepare Data Collator and Tokenize Training Data
from transformers import DataCollatorForLanguageModeling

def prepare_training_data():
    print("Preparing training data...")
    
    def tokenize_data(examples):
        model_inputs = tokenizer(
            examples['inputs'],
            max_length=512,
            truncation=True,
            padding="max_length",
        )
        
        # Set labels to be the same as inputs for causal LM training
        model_inputs["labels"] = model_inputs["input_ids"].copy()
        
        return model_inputs

    tokenized_data = training_data.map(
        tokenize_data,
        batched=True,
        remove_columns=training_data.column_names,
        desc="Tokenizing data"
    )
    
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )
    
    return tokenized_data, data_collator

# Prepare training data and collator
tokenized_training_data, data_collator = prepare_training_data()

Preparing training data...


Tokenizing data:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [17]:
# Cell 10: Start Training
from transformers import Trainer

trainer = Trainer(
    model=model_for_training,
    args=training_args,
    train_dataset=tokenized_training_data,
    data_collator=data_collator,
)

print("Starting training...")
trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


NotImplementedError: Cannot copy out of meta tensor; no data! Please use torch.nn.Module.to_empty() instead of torch.nn.Module.to() when moving module from meta to a different device.

In [None]:
# Cell 11: Debug Evaluation
def evaluate_finetuned_model(num_examples=5, num_to_show=5):
    print(f"Debugging fine-tuned model on {num_examples} examples...")
    
    eval_subset = eval_dataset.select(range(num_examples))
    translations = []
    references = []
    
    for i, example in enumerate(eval_subset):
        source_text = example['translation']['de']
        reference = example['translation']['en']
        
        # Print exact prompt being used
        prompt = f"Translate from German to English:\n{source_text}\nTranslation:"
        print(f"\nExample {i} - Full Prompt:")
        print("="*50)
        print(prompt)
        print("="*50)
        
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model_for_training.device)
        
        # Print tokenized input
        print("\nTokenized input:")
        print(tokenizer.decode(inputs['input_ids'][0]))
        
        with torch.no_grad():
            outputs = model_for_training.generate(
                **inputs,
                max_new_tokens=128,
                num_beams=4,
                do_sample=False,
                early_stopping=True,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                return_dict_in_generate=True,
                output_scores=True  # Get token scores
            )
        
        # Get full output including intermediate tokens
        full_output = tokenizer.decode(outputs.sequences[0], skip_special_tokens=False)
        translation = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
        translation = translation.split("Translation:")[-1].strip()
        
        print("\nFull model output (including special tokens):")
        print(full_output)
        print("\nExtracted translation:")
        print(translation)
        print("\nReference:")
        print(reference)
        print("-"*80)
        
        translations.append(translation)
        references.append([reference])
    
    bleu = corpus_bleu(translations, references)
    print(f"\nBLEU score: {bleu.score}")
    
    comet = evaluate.load('comet')
    comet_scores = comet.compute(
        predictions=translations,
        references=[ref[0] for ref in references],
        sources=[ex['translation']['de'] for ex in eval_subset]
    )
    print(f"COMET score: {comet_scores['mean_score']}")
    
    return {
        'translations': translations,
        'references': references,
        'bleu': bleu.score,
        'comet': comet_scores['mean_score']
    }

# Run debug evaluation
debug_results = evaluate_finetuned_model()