# 5. Final Fraining of NCG

## Notebook-Setup

In [1]:
import transformers
import torch
import pandas as pd
import datasets

from torch.utils.tensorboard import SummaryWriter

In [2]:
run_name = "finalTraining_v1"

# define model for tokenizer
model_name = "codellama/CodeLlama-7b-hf"

# set max length for dataset
max_length = 2048
# max_length = 16000 # 16k tokens from paper -> https://arxiv.org/pdf/2308.12950.pdf -> needs to much memory

# set random seed for dataset shuffling
rand_seed = 42

# set export options
save_dataset = True
save_df = True

# dataset import folder
export_folder = "./dataset/" + run_name + "/"

## training Paths

# Tensorboard folder
tensorboard_logdir = "./runs"

# model save path
model_save_path = "./models/" + run_name + "/"

# model checkpoint path
model_checkpoint_path = "./checkpoints/" + run_name + "/"

## Training parameters

# set batch size per device
per_device_train_batch_size = 1

# set number of gradient accumulation steps -> number of updates steps to accumulate before performing a backward/update pass
gradient_accumulation_steps = 1

# create model checkpoint every x steps
save_steps=20

# Keep keep last x checkpoints
save_total_limit=5

# Enable mixed precision training -> hugh enabler for low VRAM training
fp16=True

# Log every x steps
logging_steps=50



In [3]:
### TRAINING HYPERPARAMETERS ###

## Lora
# lora rank
lora_r_value = 10

# lora alpha
lora_alpha_value = 30

# dropout for lora weights
lora_dropout = 0.05


## trainer
# Number of warmup steps for learning rate scheduler
warmup_steps=448

# set number of epochs
num_train_epochs = 6

# Learning rate
learning_rate=3.4e-5

In [4]:
### EVALUATION HYPERPARAMETERS ###

eval_steps = 50

per_device_eval_batch_size=1

gradient_checkpointing=True

eval_accumulation_steps=2

## Load dataset from disk

In [5]:
dataset = datasets.load_from_disk(export_folder + "train_dataset")

eval_dataset = datasets.load_from_disk(export_folder + "eval_dataset")

In [6]:
dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 64
})

In [7]:
len(dataset["input_ids"][42])

512

## Prepare Training

In [8]:
# load tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

# add pad token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [9]:
# define data collator
data_collator = transformers.DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)  # Set mlm=False for causal language modeling


In [10]:
writer = SummaryWriter()

### Load model

In [11]:
# load quaNtization config for 4bit quantization
quantization_config = transformers.BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16)


model = transformers.AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config, low_cpu_mem_usage=True)
model = transformers.AutoModelForCausalLM.from_pretrained(model_name, low_cpu_mem_usage=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
footprint = model.get_memory_footprint()
# format footprint to GB

# write model footprint to footprintfile.txt
with open("footprintfile.txt", "a") as f:
    f.write(f"\nThe Model {model_name} has a footprint of {footprint / 1024 / 1024 / 1024} GB ({footprint} bytes) with the following quantization config: {quantization_config}")

footprint / 1024 / 1024 / 1024

4.0046539306640625

In [13]:
### test inference
# prompt = "{ \"@class\" : \"nitrox.dlc.mirror.model.EnumModel\", \"typeName\" :"

# inputs = tokenizer(prompt, return_tensors="pt")

# outputs = model.generate(inputs.input_ids, max_length=300, do_sample=True, temperature=0.7)

# print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [14]:
# load tensorboard extension for jupyter
%load_ext tensorboard

### Lora configuration

In [15]:
import peft

# lora config quantisation params
lora_config = peft.LoraConfig(
    r=lora_r_value,
    lora_alpha=lora_alpha_value,
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
)

model.add_adapter(lora_config)

## Add eval metrics

In [16]:
import evaluate


def preprocess_logits_for_metrics(logits, labels):
    """
    Original Trainer may have a memory leak. 
    This is a workaround to avoid storing too many tensors that are not needed.
    """
    # select only the first element of the tuple when element is a tuple
    if type(logits)==tuple:
        logits = logits[0]

    # select the argmax of the logits
    logits = logits.argmax(axis=-1)
    
    return logits

def decode_logits_labels(logits, labels, print_debugg=False):
    '''Decode logits and labels to text and cut them to the same length using the decoder.
    Set print_debugg to True to enable print outputs.
    '''
    ## Convert logits to text
    predicted_text = [tokenizer.decode(logit[logit < tokenizer.vocab_size], skip_special_tokens=True) for logit in logits]

    # concatenate predicted text to one string
    predicted_text = ''.join(predicted_text)

    ## Convert labels to text
    # Labels shape: (16, 512)
    label_text = [tokenizer.decode(label[label < tokenizer.vocab_size], skip_special_tokens=True) for label in labels]
    
    # concatenate label text to one string
    label_text = ''.join(label_text)

    # cut both to same length
    predicted_text = predicted_text[:len(label_text)]
    label_text = label_text[:len(predicted_text)]

    if print_debugg:
        print("🛠️ DEBUGG decode_logits_labels 🛠️")
        print(f"PREDICTED: {predicted_text}")
        print(f"LABEL: {label_text}")

    return predicted_text, label_text

def calcuate_rouge_in_compute_metrics(predicted_text, label_text, return_long_form=False, print_debugg=False):
    ''' Calculate rouge score for a given model and predicted text in the compute metrics function.
    '''

    # compute rouge score
    rouge = evaluate.load('rouge')

    scores = rouge.compute(predictions=predicted_text, references=label_text)


    if print_debugg:
        print("🛠️ DEBUGG calcuate_rouge_in_compute_metrics 🛠️")

        print(f"Rouge scores: {scores}")
        # {'rouge1': 0.04678232172323061, 'rouge2': 0.0, 'rougeL': 0.04687151585425679, 'rougeLsum': 0.04680462025598715}
    if return_long_form:
        return {"Rouge-1 f1": scores['rouge1'], "Rouge-2 f1": scores['rouge2'], "Rouge-L f1": scores['rougeL'], "Rouge-Lsum f1": scores['rougeLsum']}
    else:
        return {"Rouge-Lsum f1": scores['rougeLsum']}


def calculate_bleu_score(predicted_text, label_text, return_long_form=False, print_debugg=False):
    ''' Calculate bleu score for a given model and predicted text in the compute metrics function.
    '''

    bleu = evaluate.load("bleu")
    bleu_score = bleu.compute(predictions=[predicted_text], references=[[label_text]])

    if print_debugg:
        print("🛠️ DEBUGG calculate_bleu_score 🛠️")
        print(f"BLEU Score: {bleu_score}")

    
    if return_long_form:
        return bleu_score
    else:
        return {"BLEU Score": bleu_score['bleu']}

def calculate_perplexity_in_compute_metrics(model, predicted_text, label_text, return_long_form=False, print_debugg=False):
    ''' Calculate perplexity for a given model and predicted text in the compute metrics function.
    Hint: The model gets loaded every eval step and is also not loaded in quantized mode. -> Leads to high memory usage, slow evaluation times and unusable results when comparing it to the quantized model.
    '''

    # load metric from evalaute
    perplexity = evaluate.load('perplexity', module_type="metric")
    
    # compute perplexity
    scores = perplexity.compute(model_id=model, predictions=predicted_text)
    
    if print_debugg:
        print("🛠️ DEBUGG calculate_perplexity_in_compute_metrics 🛠️")
        print(f"Perplexity: {scores}")
    
    return {"Perplexity": scores}


def calculate_f1_score_in_compute_metrics(predicted_text, label_text, return_long_form=False, print_debugg=False):
    ''' Calculate f1 score for a given model and predicted text in the compute metrics function.
    Hint: The metric does not work yet, because the metric is not loaded correctly.
    '''

    # load metric from evalaute
    f1_score = evaluate.load('f1', module_type="metric")

    # compute f1 score
    scores_macro = f1_score.compute(predictions=predicted_text, references=label_text, average="macro")
    scores_micro = f1_score.compute(predictions=predicted_text, references=label_text, average="micro")
    scores_weighted = f1_score.compute(predictions=predicted_text, references=label_text, average="weighted")


    if print_debugg:
        print("🛠️ DEBUGG calculate_f1_score_in_compute_metrics 🛠️")
        print(f"F1 Score macro: {scores_macro}")
        print(f"F1 Score micro: {scores_micro}")
        print(f"F1 Score weighted: {scores_weighted}")

    return {"f1 macro":scores_macro['f1'], "f1 micro":scores_micro['f1'], "f1 weighted":scores_weighted['f1']}


def print_shape_logits_labels(logits, labels):
    '''Print shape and dtype of logits and labels.'''
    print("Logits shape:", logits.shape)
    print("Labels shape:", labels.shape)
    print("Logits dtype:", logits.dtype)
    print("Labels dtype:", labels.dtype)

def compute_metrics(eval_pred):
    """Compute evaluation metrics for a model's predictions.

    Parameters:
    eval_pred (tuple): A tuple containing the logits and labels.

    Returns:
    dict: A dictionary containing the computed evaluation metrics including ROUGE scores, BLEU score, and F1 score.
    """
    # set debugg to true to enable all print outputs
    debugg = True

    logits, labels = eval_pred

    if debugg:
        print("🛠️ DEBUGG print shapes of logits and labels befor conversion 🛠️")
        print_shape_logits_labels(logits, labels)

    # mask -100 tokens from labels
    mask = labels!=-100
    logits, labels = logits[mask], labels[mask]

    if debugg:
        print("🛠️ DEBUGG print shapes of logits and labels after conversion 🛠️")
        print_shape_logits_labels(logits, labels)

    predicted_text, label_text =  decode_logits_labels(logits, labels, print_debugg=debugg)

    eval_metrics = {}

    # calculate rouge score
    # rouge_scores = calcuate_rouge_in_compute_metrics(predicted_text, label_text, return_long_form=True, print_debugg=debugg)
    # eval_metrics.update(rouge_scores)

    # calculate bleu score
    bleu_score = calculate_bleu_score(predicted_text, label_text, return_long_form=True, print_debugg=debugg)
    eval_metrics.update(bleu_score)

    # calculate f1 score
    f1_score = calculate_f1_score_in_compute_metrics(logits, labels, return_long_form=True, print_debugg=debugg)
    eval_metrics.update(f1_score)

    # calculate perplexity -> Dont use this, it does not work properly. See Hint in function description.
    # perplexity = calculate_perplexity_in_compute_metrics(model=model_name, predicted_text=predicted_text, label_text=label_text, return_long_form=True, print_debugg=True)
    # eval_metrics.update(perplexity)
   
    return eval_metrics

### Define training parameters

In [18]:
training_args = transformers.TrainingArguments(
    output_dir=model_checkpoint_path,  # Output directory for model predictions and checkpoints
    overwrite_output_dir=True,  # Overwrite existing output
    num_train_epochs=num_train_epochs, # Number of training epochs
    per_device_train_batch_size=per_device_train_batch_size,  # Batch size per device during training
    gradient_accumulation_steps=gradient_accumulation_steps,  # Number of updates steps to accumulate before performing a backward/update pass
    save_steps=save_steps,  # Create model checkpoint every x steps
    save_total_limit=save_total_limit,  # Keep keep last x checkpoints
    fp16=True,  # Enable mixed precision training -> hugh enabler for low VRAM training
    logging_dir=tensorboard_logdir,  # Directory for storing logs
    logging_steps=logging_steps,  # Log every x steps
    warmup_steps=warmup_steps,  # Number of warmup steps for learning rate scheduler
    learning_rate=learning_rate,  # Learning rate
    evaluation_strategy="steps",  # Evaluate every `logging_steps`
    eval_steps=eval_steps,  # Evaluate every x steps
    per_device_eval_batch_size=per_device_eval_batch_size,  # Batch size per device during evaluation
    gradient_checkpointing=gradient_checkpointing,  # Enable gradient checkpointing to save memory
    eval_accumulation_steps=eval_accumulation_steps,  # Accumulate evaluation steps
)


## Training

In [19]:
# define trainer for training 
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    eval_dataset=eval_dataset,
    callbacks=[transformers.integrations.TensorBoardCallback(writer)], # We log to Tensorboard
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)



dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
You are adding a <class 'transformers.integrations.integration_utils.TensorBoardCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback


In [20]:
# start tensorboard for training logs
# %tensorboard --logdir=./runs

In [21]:
### TRAIN ###
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.




Step,Training Loss,Validation Loss,Rouge-lsum f1,Bleu score,F1 macro,F1 micro,F1 weighted
10,No log,0.814561,0.045645,0.774124,0.000992,0.000258,8.6e-05
20,No log,0.814421,0.047804,0.774469,0.00094,0.000258,8.1e-05


🛠️ DEBUGG calcuate_rouge_in_compute_metrics 🛠️
Rouge scores: {'rouge1': 0.0455782009543772, 'rouge2': 0.0, 'rougeL': 0.04562279801989029, 'rougeLsum': 0.045645096552646834}
🛠️ DEBUGG calculate_bleu_score 🛠️
BLEU Score: {'bleu': 0.7741242391761936, 'precisions': [0.8730271576851983, 0.8051763367463026, 0.743279761058171, 0.6873399715504979], 'brevity_penalty': 1.0, 'length_ratio': 1.067385035665503, 'translation_length': 7033, 'reference_length': 6589}
🛠️ DEBUGG calculate_f1_score_in_compute_metrics 🛠️
F1 Score macro: {'f1': 0.000992063492063492}
F1 Score micro: {'f1': 0.0002576655501159495}
F1 Score weighted: {'f1': 8.588851670531649e-05}




🛠️ DEBUGG calcuate_rouge_in_compute_metrics 🛠️
Rouge scores: {'rouge1': 0.04771460423634337, 'rouge2': 0.0, 'rougeL': 0.04782608695652174, 'rougeLsum': 0.047803790412486065}
🛠️ DEBUGG calculate_bleu_score 🛠️
BLEU Score: {'bleu': 0.7744686944579543, 'precisions': [0.8731152204836415, 0.8055199886185802, 0.743739328400683, 0.6877757222143162], 'brevity_penalty': 1.0, 'length_ratio': 1.0667678300455234, 'translation_length': 7030, 'reference_length': 6590}
🛠️ DEBUGG calculate_f1_score_in_compute_metrics 🛠️
F1 Score macro: {'f1': 0.0009398496240601503}
F1 Score micro: {'f1': 0.0002576655501159495}
F1 Score weighted: {'f1': 8.136806845766826e-05}




KeyboardInterrupt: 

### Save model

In [None]:
# saving model for backup -> not necessary we should have checkpoints from training
trainer.save_model(model_save_path)



### Test model after training

In [None]:
# ### test inference
# prompt = "{ \"@class\" : \"nitrox.dlc.mirror.model.EnumModel\", \"typeName\" :"

# inputs = tokenizer(prompt, return_tensors="pt")

# outputs = model.generate(inputs.input_ids, max_length=300, do_sample=True, temperature=0.7)

# print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
### END OF NOTEBOOK ###