In [1]:
import transformers
import torch
import pandas as pd
import datasets
import peft
# from torch.utils.tensorboard import SummaryWriter

In [2]:
## define run name
run_name = "finalTraining_v1"
# run_name = "MLPC-2048-StarCoderBase7B"

# define model for tokenizer
model_name = "codellama/CodeLlama-7b-hf"
# model_name = "bigcode/starcoderbase-7b"

# dataset import folder
export_folder = "./dataset/" + run_name + "/"

# model save path
model_save_path = "./models/" + run_name + "/"

# model checkpoint path
model_checkpoint_path = "./checkpoints/" + run_name + "/"

In [3]:
## Test loading model and inference with that model

# load quantization config for 4bit quantization -> must be same as training
quantization_config = transformers.BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16)

# load model from model_save_path with quantization config
model = transformers.AutoModelForCausalLM.from_pretrained(model_save_path, quantization_config=quantization_config, low_cpu_mem_usage=True)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# load tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

# add pad token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [5]:
# define data collator
data_collator = transformers.DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)  # Set mlm=False for causal language modeling


In [6]:
test_dataset = datasets.load_from_disk(export_folder + "test_dataset")

dataset = datasets.load_from_disk(export_folder + "train_dataset")

In [7]:
import evaluate


def preprocess_logits_for_metrics(logits, labels):
    """
    Original Trainer may have a memory leak. 
    This is a workaround to avoid storing too many tensors that are not needed.
    """
    # select only the first element of the tuple when element is a tuple
    if type(logits)==tuple:
        logits = logits[0]

    # select the argmax of the logits
    logits = logits.argmax(axis=-1)
    
    return logits

def decode_logits_labels(logits, labels, print_debugg=False):
    '''Decode logits and labels to text and cut them to the same length using the decoder.
    Set print_debugg to True to enable print outputs.
    '''
    ## Convert logits to text
    predicted_text = [tokenizer.decode(logit[logit < tokenizer.vocab_size], skip_special_tokens=True) for logit in logits]

    # concatenate predicted text to one string
    predicted_text = ''.join(predicted_text)

    ## Convert labels to text
    # Labels shape: (16, 512)
    label_text = [tokenizer.decode(label[label < tokenizer.vocab_size], skip_special_tokens=True) for label in labels]
    
    # concatenate label text to one string
    label_text = ''.join(label_text)

    # cut both to same length
    predicted_text = predicted_text[:len(label_text)]
    label_text = label_text[:len(predicted_text)]

    if print_debugg:
        print("🛠️ DEBUGG decode_logits_labels 🛠️")
        print(f"PREDICTED: {predicted_text}")
        print(f"LABEL: {label_text}")

    return predicted_text, label_text

def calcuate_rouge_in_compute_metrics(predicted_text, label_text, return_long_form=False, print_debugg=False):
    ''' Calculate rouge score for a given model and predicted text in the compute metrics function.
    '''

    # compute rouge score
    rouge = evaluate.load('rouge')

    scores = rouge.compute(predictions=predicted_text, references=label_text)


    if print_debugg:
        print("🛠️ DEBUGG calcuate_rouge_in_compute_metrics 🛠️")

        print(f"Rouge scores: {scores}")
        # {'rouge1': 0.04678232172323061, 'rouge2': 0.0, 'rougeL': 0.04687151585425679, 'rougeLsum': 0.04680462025598715}
    if return_long_form:
        return {"Rouge-1 f1": scores['rouge1'], "Rouge-2 f1": scores['rouge2'], "Rouge-L f1": scores['rougeL'], "Rouge-Lsum f1": scores['rougeLsum']}
    else:
        return {"rouge-lsum-f1": scores['rougeLsum']}


def calculate_bleu_score(predicted_text, label_text, return_long_form=False, print_debugg=False):
    ''' Calculate bleu score for a given model and predicted text in the compute metrics function.
    '''

    bleu = evaluate.load("bleu")
    bleu_score = bleu.compute(predictions=[predicted_text], references=[[label_text]])

    if print_debugg:
        print("🛠️ DEBUGG calculate_bleu_score 🛠️")
        print(f"BLEU Score: {bleu_score}")

    
    if return_long_form:
        return bleu_score
    else:
        return {"BLEU Score": bleu_score['bleu']}

def calculate_perplexity_in_compute_metrics(model, predicted_text, label_text, return_long_form=False, print_debugg=False):
    ''' Calculate perplexity for a given model and predicted text in the compute metrics function.
    Hint: The model gets loaded every eval step and is also not loaded in quantized mode. -> Leads to high memory usage, slow evaluation times and unusable results when comparing it to the quantized model.
    '''

    # load metric from evalaute
    perplexity = evaluate.load('perplexity', module_type="metric")
    
    # compute perplexity
    scores = perplexity.compute(model_id=model, predictions=predicted_text)
    
    if print_debugg:
        print("🛠️ DEBUGG calculate_perplexity_in_compute_metrics 🛠️")
        print(f"Perplexity: {scores}")
    
    return {"Perplexity": scores}


def calculate_f1_score_in_compute_metrics(predicted_text, label_text, return_long_form=False, print_debugg=False):
    ''' Calculate f1 score for a given model and predicted text in the compute metrics function.
    Hint: The metric does not work yet, because the metric is not loaded correctly.
    '''

    # load metric from evalaute
    f1_score = evaluate.load('f1', module_type="metric")

    # compute f1 score
    scores_macro = f1_score.compute(predictions=predicted_text, references=label_text, average="macro")
    scores_micro = f1_score.compute(predictions=predicted_text, references=label_text, average="micro")
    scores_weighted = f1_score.compute(predictions=predicted_text, references=label_text, average="weighted")


    if print_debugg:
        print("🛠️ DEBUGG calculate_f1_score_in_compute_metrics 🛠️")
        print(f"F1 Score macro: {scores_macro}")
        print(f"F1 Score micro: {scores_micro}")
        print(f"F1 Score weighted: {scores_weighted}")

    return {"f1 macro":scores_macro['f1'], "f1 micro":scores_micro['f1'], "f1 weighted":scores_weighted['f1']}


def print_shape_logits_labels(logits, labels):
    '''Print shape and dtype of logits and labels.'''
    print("Logits shape:", logits.shape)
    print("Labels shape:", labels.shape)
    print("Logits dtype:", logits.dtype)
    print("Labels dtype:", labels.dtype)

def compute_metrics(eval_pred):
    # set debugg to true to enable all print outputs
    debugg = False

    logits, labels = eval_pred

    if debugg:
        print("🛠️ DEBUGG print shapes of logits and labels befor conversion 🛠️")
        print_shape_logits_labels(logits, labels)

    # mask -100 tokens from labels
    mask = labels!=-100
    logits, labels = logits[mask], labels[mask]

    if debugg:
        print("🛠️ DEBUGG print shapes of logits and labels after conversion 🛠️")
        print_shape_logits_labels(logits, labels)

    predicted_text, label_text =  decode_logits_labels(logits, labels, print_debugg=debugg)

    eval_metrics = {}

    # calculate rouge score
    rouge_scores = calcuate_rouge_in_compute_metrics(predicted_text, label_text, return_long_form=True, print_debugg=debugg)
    eval_metrics.update(rouge_scores)

    # calculate bleu score
    bleu_score = calculate_bleu_score(predicted_text, label_text, return_long_form=True, print_debugg=debugg)
    eval_metrics.update(bleu_score)

    # calculate f1 score
    f1_score = calculate_f1_score_in_compute_metrics(logits, labels, return_long_form=True, print_debugg=debugg)
    eval_metrics.update(f1_score)

    # calculate perplexity -> Dont use this, it does not work properly. See Hint in function description.
    # perplexity = calculate_perplexity_in_compute_metrics(model=model_name, predicted_text=predicted_text, label_text=label_text, return_long_form=True, print_debugg=True)
    # eval_metrics.update(perplexity)
   
    return eval_metrics

In [8]:

# define model for tokenizer
model_name = "codellama/CodeLlama-7b-hf"

# set max length for dataset
max_length = 2048
# max_length = 16000 # 16k tokens from paper -> https://arxiv.org/pdf/2308.12950.pdf -> needs to much memory

# set random seed for dataset shuffling
rand_seed = 42

# set export options
save_dataset = True
save_df = True

# dataset import folder
export_folder = "./dataset/" + "finalTraining_v1" + "/"

## training Paths

# Tensorboard folder
tensorboard_logdir = "./runs"

# model save path
model_save_path = "./models/" + run_name + "/"

# model checkpoint path
model_checkpoint_path = "./checkpoints/" + run_name + "/"

## Training parameters

# set batch size per device
per_device_train_batch_size = 1

# set number of gradient accumulation steps -> number of updates steps to accumulate before performing a backward/update pass
gradient_accumulation_steps = 1

# create model checkpoint every x steps
save_steps=50

# Keep keep last x checkpoints
save_total_limit=100

# Enable mixed precision training -> hugh enabler for low VRAM training
fp16=True

# Log every x steps
logging_steps=50

In [9]:
## Lora
# lora rank
lora_r_value = 10

# lora alpha
lora_alpha_value = 30

# dropout for lora weights
lora_dropout = 0.05


## trainer
# Number of warmup steps for learning rate scheduler
warmup_steps=448

# set number of epochs
num_train_epochs = 16

# Learning rate
learning_rate=3.4e-5

# %%
### EVALUATION HYPERPARAMETERS ###

eval_steps = 50

per_device_eval_batch_size=1

gradient_checkpointing=True

eval_accumulation_steps=2


In [10]:
training_args = transformers.TrainingArguments(
        output_dir=model_checkpoint_path,  # Output directory for model predictions and checkpoints
        overwrite_output_dir=True,  # Overwrite existing output
        num_train_epochs=num_train_epochs, # Number of training epochs
        per_device_train_batch_size=per_device_train_batch_size,  # Batch size per device during training
        gradient_accumulation_steps=gradient_accumulation_steps,  # Number of updates steps to accumulate before performing a backward/update pass
        # save_steps=save_steps,  # Create model checkpoint every x steps
        # save_total_limit=save_total_limit,  # Keep keep last x checkpoints
        fp16=True,  # Enable mixed precision training -> hugh enabler for low VRAM training
        # logging_dir=tensorboard_logdir,  # Directory for storing logs
        # logging_steps=logging_steps,  # Log every x steps
        warmup_steps=warmup_steps,  # Number of warmup steps for learning rate scheduler
        learning_rate=learning_rate,  # Learning rate
        evaluation_strategy="steps",  # Evaluate every `logging_steps`
        eval_steps=eval_steps,  # Evaluate every x steps
        per_device_eval_batch_size=per_device_eval_batch_size,  # Batch size per device during evaluation
        gradient_checkpointing=gradient_checkpointing,  # Enable gradient checkpointing to save memory
        eval_accumulation_steps=eval_accumulation_steps,  # Accumulate evaluation steps
    )


In [11]:
eval_trainer = transformers.Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
        preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    )

In [12]:
test_results = eval_trainer.evaluate()
print(test_results)

Trainer is attempting to log a value of "[0.9969825042969652, 0.993626161491241, 0.9901426125176636, 0.9867039499070591]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.030932707712054253, 'eval_Rouge-1 f1': 0.056495011311288285, 'eval_Rouge-2 f1': 0.0, 'eval_Rouge-L f1': 0.05649783208143375, 'eval_Rouge-Lsum f1': 0.05648827280482966, 'eval_bleu': 0.9918563846595103, 'eval_precisions': [0.9969825042969652, 0.993626161491241, 0.9901426125176636, 0.9867039499070591], 'eval_brevity_penalty': 1.0, 'eval_length_ratio': 1.000561485916329, 'eval_translation_length': 935544, 'eval_reference_length': 935019, 'eval_f1 macro': 0.0007592371021967591, 'eval_f1 micro': 2.071808896347401e-05, 'eval_f1 weighted': 1.9867474948267826e-05, 'eval_runtime': 1479.0957, 'eval_samples_per_second': 0.406, 'eval_steps_per_second': 0.406}


In [13]:
# get results from test evaluation
print(test_results)

{'eval_loss': 0.030932707712054253, 'eval_Rouge-1 f1': 0.056495011311288285, 'eval_Rouge-2 f1': 0.0, 'eval_Rouge-L f1': 0.05649783208143375, 'eval_Rouge-Lsum f1': 0.05648827280482966, 'eval_bleu': 0.9918563846595103, 'eval_precisions': [0.9969825042969652, 0.993626161491241, 0.9901426125176636, 0.9867039499070591], 'eval_brevity_penalty': 1.0, 'eval_length_ratio': 1.000561485916329, 'eval_translation_length': 935544, 'eval_reference_length': 935019, 'eval_f1 macro': 0.0007592371021967591, 'eval_f1 micro': 2.071808896347401e-05, 'eval_f1 weighted': 1.9867474948267826e-05, 'eval_runtime': 1479.0957, 'eval_samples_per_second': 0.406, 'eval_steps_per_second': 0.406}
