# Evaluate the Results

In [16]:
import transformers
import torch
import pandas as pd
import datasets
import evaluate

from torch.utils.tensorboard import SummaryWriter

In [17]:
%env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

env: PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True


In [18]:
# define run name
run_name = "A6000_StartEnd_without_po"

# define model for tokenizer
model_name = "codellama/CodeLlama-7b-hf"

# dataset import folder
export_folder = "./dataset/" + run_name + "/"

# model save path
model_save_path = "./models/" + run_name + "/"

# model checkpoint path
model_checkpoint_path = "./checkpoints/" + run_name + "/"

# Tensorboard folder
tensorboard_logdir = "./runs"


In [19]:
## Test loading model and inference with that model

# load quantization config for 4bit quantization -> must be same as training
quantization_config = transformers.BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16)

# load model from model_save_path with quantization config
model = transformers.AutoModelForCausalLM.from_pretrained(model_save_path, quantization_config=quantization_config, low_cpu_mem_usage=True)

# optional: load model from checkpoint
# model = transformers.AutoModelForCausalLM.from_pretrained("./output/bigRun/checkpoint-1000", quantization_config=quantization_config, low_cpu_mem_usage=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [20]:
# load tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

# add pad token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [21]:
# load tokenized test dataset
test_dataset = datasets.load_from_disk(export_folder + "test_dataset")

In [22]:
## Training parameters

# set number of epochs
num_train_epochs = 3

# set batch size per device
per_device_train_batch_size = 1

# set number of gradient accumulation steps -> number of updates steps to accumulate before performing a backward/update pass
gradient_accumulation_steps=5

# create model checkpoint every x steps
save_steps=20

# Keep keep last x checkpoints
save_total_limit=5

# Enable mixed precision training -> hugh enabler for low VRAM training
fp16=True

# Log every x steps
logging_steps=50


In [23]:
### TRAINING HYPERPARAMETERS ###

## Lora
# lora rank
lora_r_value = 8

# lora alpha
lora_alpha_value = 16

# dropout for lora weights
lora_dropout = 0.05

## trainer
# Number of warmup steps for learning rate scheduler
warmup_steps=800

# Learning rate
learning_rate=2e-5  

In [24]:
# define data collator
data_collator = transformers.DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)  # Set mlm=False for causal language modeling


In [25]:
training_args = transformers.TrainingArguments(
    output_dir=model_checkpoint_path,  # Output directory for model predictions and checkpoints
    overwrite_output_dir=True,  # Overwrite existing output
    num_train_epochs=num_train_epochs, # Number of training epochs
    per_device_train_batch_size=per_device_train_batch_size,  # Batch size per device during training
    gradient_accumulation_steps=gradient_accumulation_steps,  # Number of updates steps to accumulate before performing a backward/update pass
    save_steps=save_steps,  # Create model checkpoint every x steps
    save_total_limit=save_total_limit,  # Keep keep last x checkpoints
    fp16=True,  # Enable mixed precision training -> hugh enabler for low VRAM training
    logging_dir=tensorboard_logdir,  # Directory for storing logs
    logging_steps=logging_steps,  # Log every x steps
    warmup_steps=warmup_steps,  # Number of warmup steps for learning rate scheduler
    learning_rate=learning_rate,  # Learning rate
    evaluation_strategy="steps",  # Evaluate every `logging_steps`
    eval_steps=1,  # Evaluate every x steps
    per_device_eval_batch_size=1,
    gradient_checkpointing=True,
    debug=True,


)

In [26]:
import numpy as np
from torch.nn import functional as F
from nltk.translate.bleu_score import corpus_bleu
import nltk
from codebleu import calc_codebleu
from rouge import Rouge

def compute_metrics(eval_pred):
    torch.cuda.empty_cache()
    logits, labels = eval_pred
    # print("Min token id:", min(labels.flatten()))
    # print("Max token id:", max(labels.flatten()))

    mask = labels!=-100 # we dont need to calculate loss for -100 tokens -> padding tokens
    logits, labels = logits[mask], labels[mask]

    if isinstance(logits, np.ndarray):
        logits = torch.from_numpy(logits)
    if isinstance(labels, np.ndarray):
        labels = torch.from_numpy(labels)

    # Ensure the tensors are on the same device and in the correct format
    logits = logits.to(labels.device).view(-1, logits.size(-1))
    labels = labels.view(-1)

    
    ### Calculate cross entropy
    loss = F.cross_entropy(logits, labels.long())

    # Calculate perplexity
    perplexity = torch.exp(loss)

    # print("Perplexity:", perplexity.item())

    ### Calculate BLEU score

    smoothing_function = nltk.translate.bleu_score.SmoothingFunction().method1

    references = [tokenizer.decode(labels.tolist()).split()]
    # print("References:", references)
    candidates = [tokenizer.decode(logits.argmax(dim=-1).tolist()).split()]
    # print("Candidates:", candidates)
    bleu_score = corpus_bleu(references, candidates, weights=(0.5, 0.5), smoothing_function=smoothing_function)

    # print("BLEU Score:", bleu_score)

    # Calculate codeBLEU score
    # codebleu_score = calc_codebleu(references, candidates, lang="JSON")
    # print(codebleu_score)
    
    # Calculate ROUGE score
    rouge = Rouge()
    scores = rouge.get_scores(' '.join(candidates[0]), ' '.join(references[0]))

    # return {"perplexity": perplexity.item(), "bleu_score": bleu_score, "codebleu_score": codebleu_score, "rouge_score": scores}
    # return {"perplexity": perplexity.item(), "bleu_score": bleu_score, "rouge_score_1_f1": scores[0]["rouge-1"]["f"], "rouge_score_1_p": scores[0]["rouge-1"]["p"], "rouge_score_1_r": scores[0]["rouge-1"]["r"], "rouge_score_2_f1": scores[0]["rouge-2"]["f"], "rouge_score_2_p": scores[0]["rouge-2"]["p"], "rouge_score_2_r": scores[0]["rouge-2"]["r"], "rouge_score_l_f1": scores[0]["rouge-l"]["f"], "rouge_score_l_p": scores[0]["rouge-l"]["p"], "rouge_score_l_r": scores[0]["rouge-l"]["r"]} 
    return {"perplexity": perplexity.item(), "bleu_score": bleu_score, "rouge_score_1_f1": scores[0]["rouge-1"]["f"], "rouge_score_2_f1": scores[0]["rouge-2"]["f"], "rouge_score_l_f1": scores[0]["rouge-l"]["f"]}
    # return {"perplexity": perplexity.item(), "bleu_score": bleu_score}

In [27]:
eval_trainer = transformers.Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [28]:
eval_results = eval_trainer.evaluate()


TypeError: argument of type 'bool' is not iterable

In [15]:
eval_results

NameError: name 'eval_results' is not defined


### ROUGE:

ROUGE-Score 1 -> 1-gram wörter Übereinstimmung
ROUGE-Score 2 -> 2-gram wörter Übereinstimmung
ROUGE-Score l -> längste n-gram Übereinstimmung

F1 Score: https://stephenallwright.com/interpret-f1-score/

|Score|Interp.|
|---|---|
|>0.9 | Very good|
|0.8 - 0.9	|Good|
|0.5 - 0.8	|OK|
|<0.5	|Not good|

### perplexity:
0 - 100 gut
Normalerweise <1000 

### BLEU Score
