In [None]:
import torch
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, Trainer,TrainingArguments, AutoModelForCausalLM, AutoTokenizer, DataCollatorForSeq2Seq
from datasets import load_dataset, Dataset, load_from_disk
from sacrebleu import corpus_bleu
import pandas as pd
from comet import download_model, load_from_checkpoint
import os
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from datasets import load_from_disk

print(f"NumPy version: {np.__version__}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"GPU name: {torch.cuda.get_device_name(0)}")
os.environ["HUGGING_FACE_HUB_TOKEN"] = "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"


In [None]:
#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#print(f"Using device: {device}")

In [None]:
os.environ["HUGGING_FACE_HUB_TOKEN"] = "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"  # Replace with your actual token


In [4]:
# Imports
import os
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, Trainer, TrainingArguments

subset_data_dir = "./data/wmt19_subset"
tokenized_data_dir = "./data/tokenized_subset"
model_dir = "./fine_tuned_llama"

llama_model_name = "meta-llama/Llama-2-7b-hf"  # Replace with your desired LLaMA model


In [6]:
print("Loading the WMT dataset...")
wmt_dataset = load_dataset("wmt19", "de-en", split="train")

print("Shuffling and reducing dataset size to 1 million samples...")
subset_dataset = wmt_dataset.shuffle(seed=42).select(range(1_000_000))
print(f"Subset size: {len(subset_dataset)}")

# Save the subset dataset for reuse
os.makedirs(subset_data_dir, exist_ok=True)
subset_dataset.save_to_disk(subset_data_dir)
print(f"Subset dataset saved to {subset_data_dir}")


Loading the WMT dataset...
Shuffling and reducing dataset size to 1 million samples...
Subset size: 1000000


Saving the dataset (0/1 shards):   0%|          | 0/1000000 [00:00<?, ? examples/s]

Subset dataset saved to ./data/wmt19_subset


In [5]:
# Reload the subset dataset
subset_dataset = load_from_disk(subset_data_dir)
print(f"Subset dataset loaded. Size: {len(subset_dataset)}")


Subset dataset loaded. Size: 1000000


In [6]:
# Load LLaMA tokenizer
print("Loading the tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(llama_model_name)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"


Loading the tokenizer...


In [7]:
def tokenize_function(batch):
    # Extract source and target texts
    sources = [item["de"] for item in batch["translation"]]
    targets = [item["en"] for item in batch["translation"]]
    
    # Tokenize sources and targets
    model_inputs = tokenizer(
        sources,
        max_length=512,
        truncation=True,
        padding="max_length"
    )
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=512,
            truncation=True,
            padding="max_length"
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

print("Tokenizing the dataset...")
tokenized_dataset = subset_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["translation"],
    batch_size=32
)

os.makedirs(tokenized_data_dir, exist_ok=True)
tokenized_dataset.save_to_disk(tokenized_data_dir)
print(f"Tokenized dataset saved to {tokenized_data_dir}")


Tokenizing the dataset...


Saving the dataset (0/14 shards):   0%|          | 0/1000000 [00:00<?, ? examples/s]

Tokenized dataset saved to ./data/tokenized_subset


In [8]:
tokenized_dataset = load_from_disk(tokenized_data_dir)
print(f"Tokenized dataset loaded. Size: {len(tokenized_dataset)}")


Tokenized dataset loaded. Size: 1000000


In [9]:
validation_dataset = load_dataset("wmt19", "de-en", split="validation")
print(f"Validation dataset size: {len(validation_dataset)}")

tokenized_validation = validation_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["translation"],
    batch_size=32
)
print("Validation dataset tokenized.")


Validation dataset size: 2998
Validation dataset tokenized.


In [10]:
from peft import get_peft_model, LoraConfig, TaskType

# Load the LLaMA model
print("Loading the LLaMA model...")
model = AutoModelForCausalLM.from_pretrained(
    llama_model_name,
    device_map="auto",  # Automatically distribute across GPUs
    #quantization_config=BitsAndBytesConfig(load_in_8bit=True)
    load_in_8bit=True,  # Use 8-bit precision for reduced memory usage
    torch_dtype="auto"
)

# Configure LoRA
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # Sequence-to-sequence task
    inference_mode=False,  # Training mode
    r=16,  # LoRA rank
    lora_alpha=32,  # Scaling factor for LoRA updates
    lora_dropout=0.1  # Dropout to prevent overfitting
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
print("LoRA applied to the model.")


Loading the LLaMA model...


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LoRA applied to the model.


In [11]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    return_tensors="pt"
)

training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    eval_strategy="steps",
    save_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    save_steps=500,
    logging_steps=100,
    eval_steps=500,
    warmup_steps=500,
    weight_decay=0.01,
    fp16=True,
    logging_dir="./logs"
)


In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_validation,
    tokenizer=tokenizer,
    data_collator=data_collator
)

print("Starting training...")
trainer.train()

trainer.save_model(model_dir)
print(f"Model saved to {model_dir}")


  trainer = Trainer(


Starting training...


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
#####New test

In [3]:

import os
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, Trainer, TrainingArguments

subset_data_dir = "./data/wmt19_subset"
tokenized_data_dir = "./data/tokenized_subset"
model_dir = "./fine_tuned_llama"

llama_model_name = "meta-llama/Llama-2-7b-hf"  # Replace with your desired LLaMA model


In [4]:
print("Loading the WMT dataset...")
wmt_dataset = load_dataset("wmt19", "de-en", split="train")

print("Shuffling and reducing dataset size to 1 hundred thousand samples...")
subset_dataset = wmt_dataset.shuffle(seed=42).select(range(100_000))
print(f"Subset size: {len(subset_dataset)}")

# Save the subset dataset for reuse
os.makedirs(subset_data_dir, exist_ok=True)
subset_dataset.save_to_disk(subset_data_dir)
print(f"Subset dataset saved to {subset_data_dir}")


Loading the WMT dataset...
Shuffling and reducing dataset size to 1 hundred thousand samples...
Subset size: 100000


Saving the dataset (0/1 shards):   0%|          | 0/100000 [00:00<?, ? examples/s]

Subset dataset saved to ./data/wmt19_subset


In [13]:
subset_dataset = load_from_disk(subset_data_dir)
print(f"Subset dataset loaded. Size: {len(subset_dataset)}")


Subset dataset loaded. Size: 100000


In [14]:
print("Loading the tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(llama_model_name)

# Configure tokenizer padding and truncation
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"


Loading the tokenizer...


In [15]:
def build_causal_prompt(batch):

    sources = [ex["de"] for ex in batch["translation"]]
    targets = [ex["en"] for ex in batch["translation"]]
    
    combined_texts = []
    for src, tgt in zip(sources, targets):

        prompt = f"translate to english: {src}\n{tgt}"
        combined_texts.append(prompt)
    

    return {"text": combined_texts}


print("Building causal prompts...")
causal_dataset = subset_dataset.map(
    build_causal_prompt,
    batched=True,
    remove_columns=["translation"]  
)

def tokenize_causal(batch):
    """
    Tokenize the single 'text' field for a causal language model.
    We do NOT create 'labels' here. We'll let the DataCollatorForLanguageModeling
    handle shifting of tokens into labels.
    """
    return tokenizer(
        batch["text"],
        max_length=512,
        truncation=True,
        padding="max_length"
    )

print("Tokenizing the dataset for causal LM...")
tokenized_dataset = causal_dataset.map(
    tokenize_causal,
    batched=True,
    remove_columns=["text"]
)

os.makedirs(tokenized_data_dir, exist_ok=True)
tokenized_dataset.save_to_disk(tokenized_data_dir)
print(f"Tokenized dataset saved to {tokenized_data_dir}")


Building causal prompts...


Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Tokenizing the dataset for causal LM...


Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100000 [00:00<?, ? examples/s]

Tokenized dataset saved to ./data/tokenized_subset


In [16]:

tokenized_dataset = load_from_disk(tokenized_data_dir)
print(f"Tokenized dataset loaded. Size: {len(tokenized_dataset)}")


Tokenized dataset loaded. Size: 100000


In [17]:
validation_dataset = load_dataset("wmt19", "de-en", split="validation")
print(f"Validation dataset size: {len(validation_dataset)}")

def build_causal_prompt_val(batch):
    sources = [ex["de"] for ex in batch["translation"]]
    targets = [ex["en"] for ex in batch["translation"]]
    combined_texts = []
    for src, tgt in zip(sources, targets):
        prompt = f"translate to english: {src}\n{tgt}"
        combined_texts.append(prompt)
    return {"text": combined_texts}

val_causal_dataset = validation_dataset.map(
    build_causal_prompt_val,
    batched=True,
    remove_columns=["translation"]
)

def tokenize_causal_val(batch):
    return tokenizer(
        batch["text"],
        max_length=512,
        truncation=True,
        padding="max_length"
    )

tokenized_validation = val_causal_dataset.map(
    tokenize_causal_val,
    batched=True,
    remove_columns=["text"]
)
print("Validation dataset tokenized.")


Validation dataset size: 2998


Map:   0%|          | 0/2998 [00:00<?, ? examples/s]

Map:   0%|          | 0/2998 [00:00<?, ? examples/s]

Validation dataset tokenized.


In [18]:

print("Loading the LLaMA model...")

from peft import LoraConfig, get_peft_model, TaskType
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    llama_model_name,
    device_map="auto",
    torch_dtype="auto"
)

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj","k_proj","v_proj","o_proj"],
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)

model.config.use_cache = False
model.gradient_checkpointing_enable()

print("LoRA adapter added to the model.")


Loading the LLaMA model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LoRA adapter added to the model.


In [19]:

from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # For causal LM
)

training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    eval_strategy="steps",
    save_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    save_steps=500,
    logging_steps=100,
    eval_steps=500,
    warmup_steps=500,
    weight_decay=0.01,
    fp16=True,
    logging_dir="./logs"
)


In [20]:

from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_validation,
    # tokenizer=tokenizer,
    data_collator=data_collator
)

print("Starting training...")
trainer.train()

trainer.save_model(model_dir)
print(f"Model saved to {model_dir}")


Starting training...




RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [5]:
import os
import torch
from datasets import load_dataset, load_from_disk
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    default_data_collator,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, TaskType


SUBSET_DATA_DIR = "./data/wmt19_subset"
TOKENIZED_DATA_DIR = "./data/tokenized_subset"
MODEL_DIR = "./fine_tuned_llama_lora"

BASE_MODEL_NAME = "meta-llama/Llama-2-7b-hf"
MAX_LENGTH = 256           
NUM_TRAIN_EXAMPLES = 100000 


print("Loading WMT19 (de-en) training split...")
wmt_dataset = load_dataset("wmt19", "de-en", split="train")

print(f"Shuffling and reducing dataset size to {NUM_TRAIN_EXAMPLES} samples...")
subset_dataset = wmt_dataset.shuffle(seed=42).select(range(NUM_TRAIN_EXAMPLES))
print(f"Subset size: {len(subset_dataset)}")

os.makedirs(SUBSET_DATA_DIR, exist_ok=True)
subset_dataset.save_to_disk(SUBSET_DATA_DIR)
print(f"Subset dataset saved to {SUBSET_DATA_DIR}")

# Reload subset dataset
subset_dataset = load_from_disk(SUBSET_DATA_DIR)
print(f"Subset dataset reloaded. Size: {len(subset_dataset)}")


print("Loading the LLaMA tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"


def build_prompt(src_text):

    return f"Translate from German to English:\n{src_text}\nAnswer:"

def tokenize_example(de_text, en_text):


    prompt = build_prompt(de_text)


    prompt_tokens = tokenizer(prompt, add_special_tokens=False)
    prompt_length = len(prompt_tokens["input_ids"])

 
    full_sequence = prompt + " " + en_text
    full_tokens = tokenizer(
        full_sequence,
        max_length=MAX_LENGTH,
        truncation=True,
        padding="max_length"
    )

    input_ids = full_tokens["input_ids"]
    attention_mask = full_tokens["attention_mask"]


    labels = input_ids.copy()
    for i in range(min(prompt_length, len(labels))):
        labels[i] = -100

    if all(label_id == -100 for label_id in labels):
        return None  

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

def tokenize_function(batch):
    """
    Tokenize a batch of examples, skipping any that are fully masked.
    """
    input_ids_list = []
    attention_mask_list = []
    labels_list = []

    for translation in batch["translation"]:
        de_text = translation["de"]
        en_text = translation["en"]

        tokenized = tokenize_example(de_text, en_text)
        if tokenized is None:
        
            continue

        input_ids_list.append(tokenized["input_ids"])
        attention_mask_list.append(tokenized["attention_mask"])
        labels_list.append(tokenized["labels"])

    return {
        "input_ids": input_ids_list,
        "attention_mask": attention_mask_list,
        "labels": labels_list
    }


print("Tokenizing the training dataset...")
tokenized_dataset = subset_dataset.map(
    tokenize_function,
    batched=True,
    batch_size=32,
    remove_columns=["translation"]
)

tokenized_dataset = tokenized_dataset.filter(
    lambda x: len(x["input_ids"]) > 0
)

os.makedirs(TOKENIZED_DATA_DIR, exist_ok=True)
tokenized_dataset.save_to_disk(TOKENIZED_DATA_DIR)
print(f"Tokenized dataset saved to {TOKENIZED_DATA_DIR}")

tokenized_dataset = load_from_disk(TOKENIZED_DATA_DIR)
print(f"Tokenized dataset loaded. Size: {len(tokenized_dataset)}")


print("Loading WMT19 (de-en) validation split...")
validation_dataset = load_dataset("wmt19", "de-en", split="validation")
print(f"Validation dataset size: {len(validation_dataset)}")

print("Tokenizing the validation dataset...")
tokenized_validation = validation_dataset.map(
    tokenize_function,
    batched=True,
    batch_size=32,
    remove_columns=["translation"]
)

tokenized_validation = tokenized_validation.filter(
    lambda x: len(x["input_ids"]) > 0
)


#quant_config = BitsAndBytesConfig(
    #load_in_8bit=True,
    #llm_int8_enable_fp32_cpu_offload=False
#)


print("Loading base LLaMA-2-7B model")
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_NAME,
    device_map="auto",
    #quantization_config=quant_config,
    torch_dtype=torch.float16  
)

#model.gradient_checkpointing_enable()  
model.enable_input_require_grads()     
#model.config.use_cache = False         

# Configure LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],  
    modules_to_save=None  
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


for name, param in model.named_parameters():
    if "lora" in name:
        param.requires_grad = True

data_collator = default_data_collator

training_args = TrainingArguments(
    output_dir="./results_lora",
    overwrite_output_dir=True,
    evaluation_strategy="steps",
    save_strategy="steps",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=3,
    save_steps=500,
    logging_steps=100,
    eval_steps=500,
    warmup_steps=200,
    weight_decay=0.01,
    fp16=True,
    logging_dir="./logs_lora",
    gradient_checkpointing=False,
    remove_unused_columns=False,      
    ddp_find_unused_parameters=False, 
    optim="paged_adamw_32bit"        
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_validation,
    data_collator=data_collator
)


print("Starting LoRA")
trainer.train()

trainer.save_model(MODEL_DIR)
print(f"LoRA-adapted model saved to {MODEL_DIR}")

Loading WMT19 (de-en) training split...
Shuffling and reducing dataset size to 100000 samples...
Subset size: 100000


Saving the dataset (0/1 shards):   0%|          | 0/100000 [00:00<?, ? examples/s]

Subset dataset saved to ./data/wmt19_subset
Subset dataset reloaded. Size: 100000
Loading the LLaMA tokenizer...
Tokenizing the training dataset...


Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/99963 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/99963 [00:00<?, ? examples/s]

Tokenized dataset saved to ./data/tokenized_subset
Tokenized dataset loaded. Size: 99963
Loading WMT19 (de-en) validation split...
Validation dataset size: 2998
Tokenizing the validation dataset...
Loading base LLaMA-2-7B model


We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


trainable params: 8,388,608 || all params: 6,746,804,224 || trainable%: 0.1243
Starting LoRA


Step,Training Loss,Validation Loss
500,0.5135,0.57049
1000,0.5165,0.573536
1500,0.5025,0.574116
2000,0.5101,0.57327
2500,0.5227,0.573123
3000,0.52,0.570526
3500,0.5117,0.574462
4000,0.5007,0.578661
4500,0.4898,0.576442
5000,0.502,0.574716


LoRA-adapted model saved to ./fine_tuned_llama_lora


In [5]:
from datasets import load_dataset

wmt_dataset = load_dataset("wmt19", "de-en")
train_full = wmt_dataset["train"]
valid_official = wmt_dataset["validation"]


In [6]:
split_datasets = train_full.train_test_split(
    test_size=0.1,
    seed=42
)
train_subset = split_datasets["train"]
test_subset = split_datasets["test"]


In [12]:
import torch
import numpy as np
 
def get_gpu_memory_info():
    """
    Returns information about GPU memory usage.
    """
    if not torch.cuda.is_available():
        return "No GPU available"
    
    # Get the current GPU device
    device = torch.cuda.current_device()
    
    # Get memory information in bytes
    total_memory = torch.cuda.get_device_properties(device).total_memory
    reserved_memory = torch.cuda.memory_reserved(device)
    allocated_memory = torch.cuda.memory_allocated(device)
    free_memory = total_memory - reserved_memory
    
    # Convert to GB for better readability
    total_memory_gb = total_memory / (1024**3)
    reserved_memory_gb = reserved_memory / (1024**3)
    allocated_memory_gb = allocated_memory / (1024**3)
    free_memory_gb = free_memory / (1024**3)
    
    memory_info = {
        "total": total_memory_gb,
        "reserved": reserved_memory_gb,
        "allocated": allocated_memory_gb,
        "free": free_memory_gb
    }
    
    return memory_info

# Clear cache first
torch.cuda.empty_cache()

# Get and print memory info
memory_info = get_gpu_memory_info()

if isinstance(memory_info, dict):
    print(f"GPU Memory Information:")
    print(f"Total Memory: {memory_info['total']:.2f} GB")
    print(f"Reserved Memory: {memory_info['reserved']:.2f} GB")
    print(f"Allocated Memory: {memory_info['allocated']:.2f} GB")
    print(f"Free Memory: {memory_info['free']:.2f} GB")
else:
    print(memory_info)

GPU Memory Information:
Total Memory: 39.39 GB
Reserved Memory: 0.00 GB
Allocated Memory: 0.00 GB
Free Memory: 39.39 GB


In [1]:
import os
import shutil
import stat

def remove_directory(path):
    def remove_readonly(func, path, _):
        "Clear the readonly bit and reattempt removal"
        os.chmod(path, stat.S_IWRITE)
        func(path)
    
    try:
        print(f"Attempting to remove: {path}")
        shutil.rmtree(path, ignore_errors=False)
    except PermissionError:
        try:
            shutil.rmtree(path, onerror=remove_readonly)
        except Exception as e:
            print(f"Error removing directory: {str(e)}")
            
            for root, dirs, files in os.walk(path):
                for dir in dirs:
                    try:
                        os.chmod(os.path.join(root, dir), 0o777)
                    except Exception as e:
                        print(f"Error changing directory permissions: {str(e)}")
                for file in files:
                    try:
                        os.chmod(os.path.join(root, file), 0o777)
                    except Exception as e:
                        print(f"Error changing file permissions: {str(e)}")
            shutil.rmtree(path, ignore_errors=True)
    print(f"Attempted to remove folder at: {path}")

# Go up one level from notebooks to thesis_project, then target the data folder
path_to_data = os.path.abspath(os.path.join(os.getcwd(), '.', 'results'))
remove_directory(path_to_data)

Attempting to remove: /home/jovyan/thesis_project/notebooks/results
Attempted to remove folder at: /home/jovyan/thesis_project/notebooks/results


In [2]:
!nvidia-smi

Thu Jan  2 03:27:04 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.216.03             Driver Version: 535.216.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-PCIE-40GB          On  | 00000000:25:00.0 Off |                    0 |
| N/A   33C    P0              38W / 250W |      0MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
#####new test

In [18]:
import os
import torch
from datasets import load_dataset, load_from_disk
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    default_data_collator,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, TaskType

In [7]:
SUBSET_DATA_DIR = "./data/wmt19_subset"
TOKENIZED_DATA_DIR = "./data/tokenized_subset"
MODEL_DIR = "./fine_tuned_llama_lora"

BASE_MODEL_NAME = "meta-llama/Llama-2-7b-hf"
MAX_LENGTH = 256
NUM_TRAIN_EXAMPLES = 2000  


In [8]:
wmt_dataset = load_dataset("wmt19", "de-en", split="train")

subset_dataset = wmt_dataset.shuffle(seed=42).select(range(NUM_TRAIN_EXAMPLES))
os.makedirs(SUBSET_DATA_DIR, exist_ok=True)
subset_dataset.save_to_disk(SUBSET_DATA_DIR)


Saving the dataset (0/1 shards):   0%|          | 0/2000 [00:00<?, ? examples/s]

In [9]:
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

def build_prompt(src_text):
    return f"Translate from German to English:\n{src_text}\nAnswer:"


In [10]:
def tokenize_example(de_text, en_text):
    prompt = build_prompt(de_text)
    prompt_tokens = tokenizer(prompt, add_special_tokens=False)
    prompt_length = len(prompt_tokens["input_ids"])

    full_sequence = prompt + " " + en_text
    full_tokens = tokenizer(
        full_sequence,
        max_length=MAX_LENGTH,
        truncation=True,
        padding="max_length"
    )
    input_ids = full_tokens["input_ids"]
    attention_mask = full_tokens["attention_mask"]

    labels = input_ids.copy()
    for i in range(min(prompt_length, len(labels))):
        labels[i] = -100

    if all(label_id == -100 for label_id in labels):
        return None

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }


In [12]:
def tokenize_function(batch):
    input_ids_list = []
    attention_mask_list = []
    labels_list = []

    for translation in batch["translation"]:
        de_text = translation["de"]
        en_text = translation["en"]

        tokenized = tokenize_example(de_text, en_text)
        if tokenized is None:
            continue

        input_ids_list.append(tokenized["input_ids"])
        attention_mask_list.append(tokenized["attention_mask"])
        labels_list.append(tokenized["labels"])

    return {
        "input_ids": input_ids_list,
        "attention_mask": attention_mask_list,
        "labels": labels_list
    }


In [13]:
tokenized_dataset = subset_dataset.map(
    tokenize_function,
    batched=True,
    batch_size=32,
    remove_columns=["translation"]
)

tokenized_dataset = tokenized_dataset.filter(
    lambda x: len(x["input_ids"]) > 0
)


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1999 [00:00<?, ? examples/s]

In [14]:
validation_dataset = load_dataset("wmt19", "de-en", split="validation")
validation_dataset = validation_dataset.select(range(500))

tokenized_validation = validation_dataset.map(
    tokenize_function,
    batched=True,
    batch_size=32,
    remove_columns=["translation"]
)

tokenized_validation = tokenized_validation.filter(
    lambda x: len(x["input_ids"]) > 0
)


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

In [15]:
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.float16
)
model.enable_input_require_grads()


We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [19]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    modules_to_save=None
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 8,388,608 || all params: 6,746,804,224 || trainable%: 0.1243


In [20]:
training_args = TrainingArguments(
    output_dir="./results_lora",
    evaluation_strategy="steps",
    save_strategy="steps",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=3,
    save_steps=500,
    logging_steps=100,
    eval_steps=500,
    warmup_steps=200,
    weight_decay=0.01,
    fp16=True,
    logging_dir="./logs_lora",
    gradient_checkpointing=False,
    remove_unused_columns=False,
    ddp_find_unused_parameters=False,
    optim="paged_adamw_32bit"
)




In [21]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_validation,
    data_collator=default_data_collator
)

trainer.train()
trainer.save_model(MODEL_DIR)


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss
500,0.524,0.56427


In [22]:
# Evaluate zero-shot
trainer.evaluate(tokenized_validation)


{'eval_loss': 0.5758126378059387,
 'eval_runtime': 14.4155,
 'eval_samples_per_second': 34.685,
 'eval_steps_per_second': 4.37,
 'epoch': 3.0}

In [23]:
import math

perplexity = math.exp(0.5758)
print("Perplexity:", perplexity)


Perplexity: 1.778552800169017


In [24]:
from transformers import pipeline

translation_pipeline = pipeline(
    task="text-generation",
    model="./fine_tuned_llama_lora",   
    tokenizer="meta-llama/Llama-2-7b-hf",
    device_map="auto",
    torch_dtype=torch.float16
)

for i in range(5):
    src_text = validation_dataset[i]["translation"]["de"] 
    prompt = f"Translate from German to English:\n{src_text}\nAnswer:"
    
    output = translation_pipeline(prompt, 
                                  max_length=100, 
                                  num_return_sequences=1, 
                                  do_sample=False)  
    print("German:", src_text)
    print("Model Output:", output[0]["generated_text"])
    print("Reference:", validation_dataset[i]["translation"]["en"])
    print("---------------------------------------------------\n")


We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


German: MÃ¼nchen 1856: Vier Karten, die Ihren Blick auf die Stadt verÃ¤ndern
Model Output: Translate from German to English:
MÃ¼nchen 1856: Vier Karten, die Ihren Blick auf die Stadt verÃ¤ndern
Answer: MÃ¼nchen 1856: Vier Karten, die Ihren Blick auf die Stadt verÃ¤ndern
Translate from English to German:
Munich 1856: Four cards that change your view of the city
Answer: Munich 1856: Four cards that change
Reference: Munich 1856: Four maps that will change your view of the city
---------------------------------------------------

German: Eine Irren-Anstalt, wo sich heute Jugendliche begegnen sollen.
Model Output: Translate from German to English:
Eine Irren-Anstalt, wo sich heute Jugendliche begegnen sollen.
Answer: A mental institution where youths should meet today.
Translate from English to German:
A mental institution where youths should meet today.
Answer: Eine Irren-Anstalt, wo sich heute Jugendliche begegnen sollen.
Translate from English to German: A mental institution where youth

In [28]:
import evaluate

bleu = evaluate.load("sacrebleu")

# We'll collect model predictions and references
predictions = []
references = []
for i in range(len(tokenized_validation)):
    # Prepare the prompt
    src_text = validation_dataset[i]["translation"]["de"]
    prompt = f"Translate from German to English:\n{src_text}\nAnswer:"
    
    # Generate
    output = translation_pipeline( prompt,
    max_length=None,     # Disable the old max_length
    max_new_tokens=64,   # Generate up to 64 new tokens beyond the prompt
    do_sample=False)
    # The modelâ€™s text may include the prompt, so parse out only the answer portion if needed.
    translation_text = output[0]["generated_text"].split("Answer:")[-1].strip()
    
    predictions.append(translation_text)
    references.append([validation_dataset[i]["translation"]["en"]])  
    # note: sacrebleu expects a list of references

# Now compute BLEU
results = bleu.compute(predictions=predictions, references=references)
print("BLEU score:", results["score"])


BLEU score: 19.810710215391737


In [31]:
comet = evaluate.load("comet")
# COMET requires predictions, references, *and* sources
comet_scores = comet.compute(
    predictions=predictions,
    references=[ref[0] for ref in references],  # flatten the references list
    sources=[validation_dataset[i]["translation"]["de"] for i in range(len(tokenized_validation))]
)
print("COMET mean_score:", comet_scores["mean_score"])
seg_scores = comet_scores["scores"]
print("Segment-level COMET scores:", seg_scores)



Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`
Encoder model frozen.
/opt/conda/lib/python3.11/site-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tok

COMET mean_score: 0.625564789891243
Segment-level COMET scores: [0.5995334386825562, 0.5374028086662292, 0.5215840935707092, 0.4270966053009033, 0.560679018497467, 0.5482152700424194, 0.4642297327518463, 0.706843912601471, 0.49775686860084534, 0.3937949240207672, 0.5705469250679016, 0.34640219807624817, 0.7867529988288879, 0.38848891854286194, 0.6351765990257263, 0.6475020051002502, 0.6894182562828064, 0.6289135217666626, 0.6403317451477051, 0.7983288764953613, 0.7446997761726379, 0.37292012572288513, 0.6914510726928711, 0.6341578960418701, 0.7429772019386292, 0.7821385860443115, 0.6365042328834534, 0.6752027273178101, 0.7674224972724915, 0.3401041030883789, 0.6171689629554749, 0.3358319103717804, 0.6423511505126953, 0.6021656394004822, 0.6700908541679382, 0.7483011484146118, 0.48467811942100525, 0.605262041091919, 0.6257359981536865, 0.4827680289745331, 0.6304999589920044, 0.5938538908958435, 0.7264378666877747, 0.7197814583778381, 0.5844296813011169, 0.43793925642967224, 0.5207476019