# 3. Hyperparameter Tuner

In this notebook we cunduct a hyperparameter tuning to find the best hyperparameters for the given circumstances.

## Notebook-Setup

In [1]:
# %pip install optuna
# %pip install jupyterlab jupyterlab-optuna
# %pip install optuna-dashboard
# %pip install plotly
# %pip install rouge_score

In [2]:
import transformers
import torch
import pandas as pd
import datasets
import plotly
import optuna
# from torch.utils.tensorboard import SummaryWriter

In [3]:
run_name = "NCG_RUN_1"

# define model for tokenizer
model_name = "codellama/CodeLlama-7b-hf"

# set max length for dataset
max_length = 512
# max_length = 16000 # 16k tokens from paper -> https://arxiv.org/pdf/2308.12950.pdf -> needs to much memory

# set random seed for dataset shuffling
rand_seed = 42

# set export options
save_dataset = True
save_df = True

# dataset import folder
export_folder = "./dataset/" + run_name + "/"

## training Paths

# Tensorboard folder
tensorboard_logdir = "./runs"

# model save path
model_save_path = "./models/" + run_name + "/"

# model checkpoint path
model_checkpoint_path = "./checkpoints/" + run_name + "/"

## Training parameters

# set batch size per device
per_device_train_batch_size = 1

# set number of gradient accumulation steps -> number of updates steps to accumulate before performing a backward/update pass
gradient_accumulation_steps = 1

# create model checkpoint every x steps
save_steps=20

# Keep keep last x checkpoints
save_total_limit=5

# Enable mixed precision training -> hugh enabler for low VRAM training
fp16=True

# Log every x steps
logging_steps=50


# dropout for lora weights
lora_dropout = 0.05


In [5]:
### EVALUATION HYPERPARAMETERS ###

eval_steps = 150

per_device_eval_batch_size=1

gradient_checkpointing=True

eval_accumulation_steps=2

## Load dataset from disk

In [6]:
# import datasets from disk
dataset = datasets.load_from_disk(export_folder + "train_dataset")
eval_dataset = datasets.load_from_disk(export_folder + "eval_dataset")

In [7]:
dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 1921
})

In [8]:
len(dataset["input_ids"][42])

2048

## Prepare Training

In [9]:
# load tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

# add pad token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [10]:
# define data collator
data_collator = transformers.DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)  # Set mlm=False for causal language modeling


## Eval Metrics

In [11]:
import evaluate


def preprocess_logits_for_metrics(logits, labels):
    """
    Original Trainer may have a memory leak. 
    This is a workaround to avoid storing too many tensors that are not needed.
    """
    # select only the first element of the tuple when element is a tuple
    if type(logits)==tuple:
        logits = logits[0]

    # select the argmax of the logits
    logits = logits.argmax(axis=-1)
    
    return logits

def decode_logits_labels(logits, labels):
    '''Decode logits and labels to text and cut them to the same length using the decoder.
    Set print_debugg to True to enable print outputs.
    '''
    ## Convert logits to text
    predicted_text = [tokenizer.decode(logit[logit < tokenizer.vocab_size], skip_special_tokens=True) for logit in logits]

    # concatenate predicted text to one string
    predicted_text = ''.join(predicted_text)

    ## Convert labels to text
    # Labels shape: (16, 512)
    label_text = [tokenizer.decode(label[label < tokenizer.vocab_size], skip_special_tokens=True) for label in labels]
    
    # concatenate label text to one string
    label_text = ''.join(label_text)

    # cut both to same length
    predicted_text = predicted_text[:len(label_text)]
    label_text = label_text[:len(predicted_text)]

    return predicted_text, label_text

def calcuate_rouge_in_compute_metrics(predicted_text, label_text, return_long_form=False):
    ''' Calculate rouge score for a given model and predicted text in the compute metrics function.
    '''

    # compute rouge score
    rouge = evaluate.load('rouge')

    scores = rouge.compute(predictions=predicted_text, references=label_text)

    if return_long_form:
        return {"Rouge-1 f1": scores['rouge1'], "Rouge-2 f1": scores['rouge2'], "Rouge-L f1": scores['rougeL'], "Rouge-Lsum f1": scores['rougeLsum']}
    else:
        return {"rouge-lsum-f1": scores['rougeLsum']}


def calculate_bleu_score(predicted_text, label_text, return_long_form=False):
    ''' Calculate bleu score for a given model and predicted text in the compute metrics function.
    '''

    bleu = evaluate.load("bleu")
    bleu_score = bleu.compute(predictions=[predicted_text], references=[[label_text]])

    
    if return_long_form:
        return bleu_score
    else:
        return {"BLEU Score": bleu_score['bleu']}



def compute_metrics(eval_pred):

    # receive logits and labels from eval_pred
    logits, labels = eval_pred


    # mask -100 tokens from labels
    mask = labels!=-100
    logits, labels = logits[mask], labels[mask]


    predicted_text, label_text =  decode_logits_labels(logits, labels)

    eval_metrics = {}

    # calculate rouge score
    rouge_scores = calcuate_rouge_in_compute_metrics(predicted_text, label_text, return_long_form=True)
    eval_metrics.update(rouge_scores)

    # calculate bleu score
    bleu_score = calculate_bleu_score(predicted_text, label_text, return_long_form=True)
    eval_metrics.update(bleu_score)

   
    return eval_metrics

### Optuna Hyperparameter Tuning

In [13]:
import peft
import optuna


def objective(trial):

    ### define hyper parameters to tune
    ## Lora
    lora_r_value = trial.suggest_int("lora_r_value", 4, 16)
    lora_alpha_value = trial.suggest_int("lora_alpha_value", 4, 16)

    ## trainer
    warmup_steps = trial.suggest_int("warmup_steps", 200, 1200)
    num_train_epochs = trial.suggest_int("num_train_epochs", 1, 5)
    learning_rate= trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    
    ## Load Model
    # load quaNtization config for 4bit quantization
    quantization_config = transformers.BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16)  
    model = transformers.AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config, low_cpu_mem_usage=True)


    # lora config quantisation params
    lora_config = peft.LoraConfig(
        r=lora_r_value,
        lora_alpha=lora_alpha_value,
        lora_dropout=lora_dropout,
        bias="none",
        task_type="CAUSAL_LM",
    )

    model.add_adapter(lora_config)

    ## Training Arguments
    training_args = transformers.TrainingArguments(
        output_dir=model_checkpoint_path,  # Output directory for model predictions and checkpoints
        overwrite_output_dir=True,  # Overwrite existing output
        num_train_epochs=num_train_epochs, # Number of training epochs
        per_device_train_batch_size=per_device_train_batch_size,  # Batch size per device during training
        gradient_accumulation_steps=gradient_accumulation_steps,  # Number of updates steps to accumulate before performing a backward/update pass
        # save_steps=save_steps,  # Create model checkpoint every x steps
        # save_total_limit=save_total_limit,  # Keep keep last x checkpoints
        fp16=True,  # Enable mixed precision training -> hugh enabler for low VRAM training
        # logging_dir=tensorboard_logdir,  # Directory for storing logs
        # logging_steps=logging_steps,  # Log every x steps
        warmup_steps=warmup_steps,  # Number of warmup steps for learning rate scheduler
        learning_rate=learning_rate,  # Learning rate
        evaluation_strategy="steps",  # Evaluate every `logging_steps`
        eval_steps=eval_steps,  # Evaluate every x steps
        per_device_eval_batch_size=per_device_eval_batch_size,  # Batch size per device during evaluation
        gradient_checkpointing=gradient_checkpointing,  # Enable gradient checkpointing to save memory
        eval_accumulation_steps=eval_accumulation_steps,  # Accumulate evaluation steps
    )

    # define trainer for training 
    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
        eval_dataset=eval_dataset,
        # Try to not evaluate during training to speed up process
        # compute_metrics=compute_metrics,
        # preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    )

    # train model
    trainer.train()

    eval_trainer = transformers.Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    )

    # evaluate model
    eval_results = eval_trainer.evaluate()
    

    # return eval results
    return eval_results["eval_loss"], eval_results["eval_bleu"], eval_results["eval_Rouge-Lsum f1"]


### Create and start Hyperparameter Tuning

In [13]:
# define sampler
# sampler = optuna.samplers.TPESampler(seed=rand_seed)
## use NSGAIISampler sampler to optimize for multiple objectives
sampler = optuna.samplers.NSGAIIISampler(seed=rand_seed)

  sampler = optuna.samplers.NSGAIIISampler(seed=rand_seed)


In [14]:
study_name = run_name
storage_name = f"sqlite:///optuna/{study_name}.db"
study = optuna.create_study(directions=['minimize', 'maximize', 'maximize'], study_name=study_name, storage=storage_name, sampler=sampler)

[I 2024-04-09 12:35:02,395] A new study created in RDB with name: A6000_OptunaRun_2048-test


In [15]:
study.optimize(objective, n_trials=100)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


🛠️ DEBUGG print shapes of logits and labels befor conversion 🛠️
Logits shape: (16, 2048)
Labels shape: (16, 2048)
Logits dtype: int64
Labels dtype: int64
🛠️ DEBUGG print shapes of logits and labels after conversion 🛠️
Logits shape: (24245,)
Labels shape: (24245,)
Logits dtype: int64
Labels dtype: int64
🛠️ DEBUGG decode_logits_labels 🛠️
PREDICTED: ua.===[lIEVlishedEventTypeNames",
ledEventTypeNames":","lEvent":","setter":false,
nametype":"orgro:...Eventror.M.Event",","name":"get_get","getlaredIn":":":"VAR_typelaredByTypeName","return":":"VARBLIC","return":[],{"@class":"nitrox.dlc.mirror.model.ParameterModel","name":"VAR_name","typeName"@class":"nitrox.dlc.mirror.model.TypeionTypeainedmentTypeModel","nameName":"VAR_nameName","isTypeName"VARONE_DOMAIN","ised":["isSub":":false,"isOptionalContainer":false,"hasMapContainer":false,"hasMapContainer":false,"hasMapContainer":false,"hasType":":nullVAR_containerTypeName","containerTypeions":[],},],"returnType":{"@class":"nitrox.dlc.mirror.model.As

Trainer is attempting to log a value of "[0.9508945210585166, 0.9205143735731258, 0.8909234926847451, 0.8633334886538372]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
[I 2024-04-09 12:35:38,475] Trial 0 finished with values: [0.48541364073753357, 0.9058274544503314, 0.047757987386513275] and parameters: {'lora_r_value': 8, 'lora_alpha_value': 16, 'warmup_steps': 932, 'num_train_epochs': 3, 'learning_rate': 1.2854415975274441e-05}. 


🛠️ DEBUGG calculate_f1_score_in_compute_metrics 🛠️
F1 Score macro: {'f1': 0.001534330648254699}
F1 Score micro: {'f1': 0.00016498247061249742}
F1 Score weighted: {'f1': 5.999362567727179e-05}


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


🛠️ DEBUGG print shapes of logits and labels befor conversion 🛠️
Logits shape: (16, 2048)
Labels shape: (16, 2048)
Logits dtype: int64
Labels dtype: int64
🛠️ DEBUGG print shapes of logits and labels after conversion 🛠️
Logits shape: (24245,)
Labels shape: (24245,)
Logits dtype: int64
Labels dtype: int64
🛠️ DEBUGG decode_logits_labels 🛠️
PREDICTED: ua.===[lIEVlishedEventTypeNames",
ledEventTypeNames":","lEvent":","setter":false,
nametype":"orgro:...Eventror.M.Event",","name":"get_get","getlaredIn":":":"VAR_typelaredByTypeName","return":":"VARBLIC","return":[],{"@class":"nitrox.dlc.mirror.model.ParameterModel","name":"VAR_name","typeName"@class":"nitrox.dlc.mirror.model.TypeionTypeainedmentTypeModel","nameName":"VAR_nameName","isTypeName"VARONE_DOMAIN","ised":["isSub":":false,"isOptionalContainer":false,"hasMapContainer":false,"hasMapContainer":false,"hasMapContainer":false,"hasType":":nullVAR_containerTypeName","containerTypeions":[],},],"returnType":{"@class":"nitrox.dlc.mirror.model.As

Trainer is attempting to log a value of "[0.9508945210585166, 0.9205143735731258, 0.8909234926847451, 0.8633334886538372]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
[I 2024-04-09 12:36:13,784] Trial 1 finished with values: [0.48541364073753357, 0.9058274544503314, 0.047757987386513275] and parameters: {'lora_r_value': 6, 'lora_alpha_value': 4, 'warmup_steps': 1067, 'num_train_epochs': 4, 'learning_rate': 3.12551431816761e-05}. 


🛠️ DEBUGG calculate_f1_score_in_compute_metrics 🛠️
F1 Score macro: {'f1': 0.001534330648254699}
F1 Score micro: {'f1': 0.00016498247061249742}
F1 Score weighted: {'f1': 5.999362567727179e-05}


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


🛠️ DEBUGG print shapes of logits and labels befor conversion 🛠️
Logits shape: (16, 2048)
Labels shape: (16, 2048)
Logits dtype: int64
Labels dtype: int64
🛠️ DEBUGG print shapes of logits and labels after conversion 🛠️
Logits shape: (24245,)
Labels shape: (24245,)
Logits dtype: int64
Labels dtype: int64
🛠️ DEBUGG decode_logits_labels 🛠️
PREDICTED: ua.===[lIEVlishedEventTypeNames",
ledEventTypeNames":","lEvent":","setter":false,
nametype":"orgro:...Eventror.M.Event",","name":"get_get","getlaredIn":":":"VAR_typelaredByTypeName","return":":"VARBLIC","return":[],{"@class":"nitrox.dlc.mirror.model.ParameterModel","name":"VAR_name","typeName"@class":"nitrox.dlc.mirror.model.TypeionTypeainedmentTypeModel","nameName":"VAR_nameName","isTypeName"VARONE_DOMAIN","ised":["isSub":":false,"isOptionalContainer":false,"hasMapContainer":false,"hasMapContainer":false,"hasMapContainer":false,"hasType":":nullVAR_containerTypeName","containerTypeions":[],},],"returnType":{"@class":"nitrox.dlc.mirror.model.As

Trainer is attempting to log a value of "[0.9508945210585166, 0.9205143735731258, 0.8909234926847451, 0.8633334886538372]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
[I 2024-04-09 12:36:49,145] Trial 2 finished with values: [0.48541364073753357, 0.9058274544503314, 0.047757987386513275] and parameters: {'lora_r_value': 4, 'lora_alpha_value': 16, 'warmup_steps': 1033, 'num_train_epochs': 2, 'learning_rate': 1.3399549522183029e-05}. 


🛠️ DEBUGG calculate_f1_score_in_compute_metrics 🛠️
F1 Score macro: {'f1': 0.001534330648254699}
F1 Score micro: {'f1': 0.00016498247061249742}
F1 Score weighted: {'f1': 5.999362567727179e-05}


## Evaluate results in Optuna

In [14]:
# import optuna study
study_name = run_name
storage_name = f"sqlite:///optuna/{study_name}.db"
study = optuna.load_study(study_name=study_name, storage=storage_name)

In [15]:
# get best trails from study
study.best_trials

[FrozenTrial(number=2, state=TrialState.COMPLETE, values=[0.05471425876021385, 0.9876206785413549, 0.06232230276080042], datetime_start=datetime.datetime(2024, 4, 9, 22, 5, 30, 359980), datetime_complete=datetime.datetime(2024, 4, 10, 0, 26, 13, 613859), params={'lora_r_value': 4, 'lora_alpha_value': 16, 'warmup_steps': 1033, 'num_train_epochs': 2, 'learning_rate': 1.3399549522183029e-05}, user_attrs={}, system_attrs={'nsga3:generation': 0}, intermediate_values={}, distributions={'lora_r_value': IntDistribution(high=16, log=False, low=4, step=1), 'lora_alpha_value': IntDistribution(high=16, log=False, low=4, step=1), 'warmup_steps': IntDistribution(high=1200, log=False, low=200, step=1), 'num_train_epochs': IntDistribution(high=5, log=False, low=1, step=1), 'learning_rate': FloatDistribution(high=5e-05, log=True, low=1e-05, step=None)}, trial_id=3, value=None),
 FrozenTrial(number=6, state=TrialState.COMPLETE, values=[0.03727734833955765, 0.9903956177128287, 0.05231322970776793], datet

### Export Study to Data Frame

In [23]:
df = study.trials_dataframe()
df.head()

Unnamed: 0,number,values_0,values_1,values_2,datetime_start,datetime_complete,duration,params_learning_rate,params_lora_alpha_value,params_lora_r_value,params_num_train_epochs,params_warmup_steps,system_attrs_nsga3:generation,state
0,0,0.047838,0.988572,0.049686,2024-04-09 14:26:04.516440,2024-04-09 17:44:53.662875,0 days 03:18:49.146435,1.3e-05,16,8,3,932,0.0,COMPLETE
1,1,0.043519,0.989534,0.047869,2024-04-09 17:44:53.704285,2024-04-09 22:05:30.327186,0 days 04:20:36.622901,3.1e-05,4,6,4,1067,0.0,COMPLETE
2,2,0.054714,0.987621,0.062322,2024-04-09 22:05:30.359980,2024-04-10 00:26:13.613859,0 days 02:20:43.253879,1.3e-05,16,4,2,1033,0.0,COMPLETE
3,3,0.051484,0.987701,0.05146,2024-04-10 00:26:13.647323,2024-04-10 03:45:18.480474,0 days 03:19:04.833151,1.6e-05,7,6,3,725,0.0,COMPLETE
4,4,0.055351,0.986182,0.051536,2024-04-10 03:45:18.521370,2024-04-10 06:06:12.163497,0 days 02:20:53.642127,2.1e-05,5,11,2,492,0.0,COMPLETE
