In [1]:
from transformers import AutoModelForCausalLM,AutoTokenizer,BitsAndBytesConfig,TrainingArguments,set_seed
from peft import LoraConfig, PeftModel
import torch
from datasets import load_dataset
from trl import SFTTrainer

import wandb
import os

hf_token = "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"
wandb_key = "7ea086a098e40728fdf48b616051776a17daf566"
os.environ["WANDB_PROJECT"] = "MLOps-Project"

#monitering login
wandb.login(key=wandb_key)

  from .autonotebook import tqdm as notebook_tqdm
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmagn3144[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/dayman/.netrc


True

In [2]:
def reload_model(model_name, dataset, r, epochs, lr):
    # Load base model(code-llama-7b) and tokenizer
    bnb_config = BitsAndBytesConfig(
        load_in_4bit= True,
        bnb_4bit_quant_type= "nf4",
        bnb_4bit_compute_dtype= torch.bfloat16,
        bnb_4bit_use_double_quant= False,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map={"": 0}
    )
    model.config.use_cache = False # silence the warnings. Please re-enable for inference!
    model.config.pretraining_tp = 1

    # Load LLaMA tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.add_eos_token = True
    tokenizer.add_bos_token, tokenizer.add_eos_token

    peft_config = LoraConfig(
        lora_alpha = 8,
        lora_dropout = 0.1,
        r = r,
        bias = "none",
        task_type = "CAUSAL_LM",
        layers_to_transform = [i for i in range(10, 32)]
    )

    training_arguments = TrainingArguments(
        output_dir= "./results",
        num_train_epochs= epochs, # 1
        per_device_train_batch_size= 2,
        gradient_accumulation_steps= 2,
        optim = "paged_adamw_8bit",
        save_steps= 100,
        logging_steps= 10, # 10,
        learning_rate= lr,
        weight_decay= 0.001,
        fp16= False,
        bf16= False,
        max_grad_norm= 0.3,
        max_steps= -1,
        warmup_ratio= 0.2, # 0.3
        group_by_length= True,
        lr_scheduler_type= "linear", # "constant"
        report_to="wandb",
        run_name=f"r={r} epochs={epochs} lr={lr}"
    )

    # Setting sft parameters
    trainer = SFTTrainer(
        model=model,
        train_dataset=dataset,
        peft_config=peft_config,
        max_seq_length= None,
        dataset_text_field="premise",
        tokenizer=tokenizer,
        args=training_arguments,
        packing= False,
    )

    print(f"Hyperparameters: r = {r}, epochs = {epochs}")
    print("Ready to train")

    return trainer, model

def save_model(trainer, model, save_directory, finetune_name):
    # Save the fine-tuned model in directory
    trainer.model.save_pretrained(save_directory + "/" + finetune_name)
    wandb.finish()
    model.config.use_cache = True
    model.eval()

def load_finetuned_model(base_model_name, model_directory, finetune_name):
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        low_cpu_mem_usage=True,
        return_dict=True,
        torch_dtype=torch.float16,
        device_map= {"": 0})

    model = PeftModel.from_pretrained(base_model, model_directory + "/" + finetune_name)
    model = model.merge_and_unload()

    # Reload tokenizer
    tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    return model, tokenizer

def upload_to_huggingface(model, tokenizer, finetuned_model_name):
    # Upload model to huggingface
    model.push_to_hub(finetuned_model_name, use_auth_token=hf_token)
    tokenizer.push_to_hub(finetuned_model_name, use_auth_token=hf_token)

In [3]:
model_name = "TinyPixel/Llama-2-7B-bf16-sharded"
dataset_name = "snli"
finetuned_model_name = "magnus42/MLOps-Projekt"
save_directory = "models" #"/work3/s204164/LLAMA2_Finetuning/trained_models/final_model"
train_dataset = load_dataset(dataset_name, split="train")

lr = 0.001
epochs = 0.01
r = 16
seed = 42

set_seed(seed)
finetune_name = f"finetuned_lr{lr}_e{epochs}_r{r}_seed{seed}"
trainer, model = reload_model(model_name, train_dataset, r, epochs, lr)
trainer.train()
save_model(trainer, model, save_directory, finetune_name)
upload_to_huggingface(model_name, save_directory, finetune_name)
# Clear the memory footprint
del model, trainer
torch.cuda.empty_cache()

print("This script has finished")

model.safetensors.index.json: 100%|██████████| 28.1k/28.1k [00:00<00:00, 48.7MB/s]
model-00001-of-00014.safetensors: 100%|██████████| 981M/981M [00:35<00:00, 27.3MB/s]
model-00002-of-00014.safetensors: 100%|██████████| 967M/967M [00:35<00:00, 27.3MB/s]
model-00003-of-00014.safetensors: 100%|██████████| 967M/967M [00:35<00:00, 27.2MB/s]
model-00004-of-00014.safetensors: 100%|██████████| 990M/990M [00:36<00:00, 27.0MB/s]
model-00005-of-00014.safetensors: 100%|██████████| 944M/944M [00:35<00:00, 26.9MB/s]
model-00006-of-00014.safetensors: 100%|██████████| 990M/990M [00:38<00:00, 25.4MB/s]
model-00007-of-00014.safetensors: 100%|██████████| 967M/967M [00:35<00:00, 27.2MB/s]
model-00008-of-00014.safetensors: 100%|██████████| 967M/967M [00:35<00:00, 27.2MB/s]
model-00009-of-00014.safetensors: 100%|██████████| 990M/990M [00:35<00:00, 27.6MB/s]
model-00010-of-00014.safetensors: 100%|██████████| 944M/944M [00:34<00:00, 27.2MB/s]
model-00011-of-00014.safetensors: 100%|██████████| 990M/990M [00:36

Hyperparameters: r = 16, epochs = 0.01
Ready to train


  0%|          | 0/1376 [00:00<?, ?it/s]You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


OutOfMemoryError: CUDA out of memory. Tried to allocate 172.00 MiB. GPU 0 has a total capacty of 7.92 GiB of which 114.38 MiB is free. Including non-PyTorch memory, this process has 6.77 GiB memory in use. Of the allocated memory 6.04 GiB is allocated by PyTorch, and 254.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF