### Imports

In [None]:
from benchmarking_metrics import benchmark_model
from datasets import Dataset
import pandas as pd
import torch
from transformers import RobertaTokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForCausalLM, M2M100ForConditionalGeneration, M2M100Tokenizer, AutoModelForSeq2SeqLM, EncoderDecoderModel, EncoderDecoderConfig,AutoConfig 
import numpy as np
from peft import PeftModel, PeftConfig
from tqdm import tqdm
import os

### Load test data

In [26]:
# Evaluation data: point this at any CSV inside output_csvs to benchmark on a different dataset.
test_df_path = os.path.join("output_csvs", "java_python.csv")
test_df = pd.read_csv(filepath_or_buffer=test_df_path)

# Excel workbook that receives the benchmarking results.
result_path = os.path.join("excel_result_files", "java_python_benchmarking.xlsx")

In [27]:
# Wrap the pandas frame in a `datasets.Dataset` so it can be tokenized with `.map`.
test_dataset = Dataset.from_pandas(df=test_df)

### Load model for benchmarking

In [None]:

# Name of the pre-trained base model that was fine-tuned.
# It is only read by the benchmark call and by the alternative loaders further down.
model_name = "<base model name>"

# Path to a saved (fine-tuned) model; only read by the alternative loaders further down.
saved_model_path = "<path to saved model>"

from transformers import AutoTokenizer, AutoModel, EncoderDecoderModel, AutoConfig
from peft import PeftModel, PeftConfig

# Every checkpoint / path is defined exactly once so that the model and its
# tokenizers cannot drift apart when one of them is changed.
encoder_checkpoint = "microsoft/codebert-base"
decoder_checkpoint = "roberta-base"
lora_adapter_path = "/home/tobias-konieczny/Schreibtisch/kabul/model_training/codebert-lora-early-stopping-finetuned"

# Build the encoder-decoder model straight from the two checkpoints.
# The encoder and decoder are NOT loaded separately beforehand: those copies
# were never used and only kept two additional models in memory.
# NOTE(review): `ignore_mismatched_sizes` carries no `encoder_`/`decoder_` prefix,
# so it is presumably not forwarded to the sub-model loaders - confirm it has an effect.
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_checkpoint,
    decoder_checkpoint,
    ignore_mismatched_sizes=True,
)

# Load the LoRA configuration (kept available for inspection)
peft_config = PeftConfig.from_pretrained(lora_adapter_path)

# Wrap the base model with the fine-tuned LoRA adapter weights
model = PeftModel.from_pretrained(model, lora_adapter_path)

# Initialize tokenizers
encoder_tokenizer = AutoTokenizer.from_pretrained(encoder_checkpoint)
decoder_tokenizer = AutoTokenizer.from_pretrained(decoder_checkpoint)



# Alternative loaders. Each snippet below is wrapped in a bare triple-quoted
# string, so it is NOT executed. To use one, run its contents instead of the
# active loading code above.

# Loader for the alirezamsh/small100 model (M2M100 architecture);
# source and target language are both set to "en".
"""
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
tokenizer.src_lang = "en"
tokenizer.tgt_lang = "en"
model = M2M100ForConditionalGeneration.from_pretrained(model_name)
"""

# Loader for a GPT-2 model: tokenizer from the base model name, weights from
# the saved model path; the EOS token is reused as the padding token.
"""
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(saved_model_path)

tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id
"""

# Loader for CodeT5 models fine-tuned with PEFT: the base model is resolved
# from the adapter config, loaded on CPU, and the adapter is merged into it.
"""
peft_model_id = saved_model_path

config = PeftConfig.from_pretrained(peft_model_id)

model = T5ForConditionalGeneration.from_pretrained(
    config.base_model_name_or_path,
    device_map="cpu",
)

tokenizer = RobertaTokenizer.from_pretrained(config.base_model_name_or_path)

model = PeftModel.from_pretrained(model, peft_model_id)
model = model.merge_and_unload()
"""

# Loader for CodeT5 models fine-tuned without PEFT.
# NOTE(review): the original comment states this can also load fine-tuned
# CodeBERT models - confirm, since the class used here is T5ForConditionalGeneration.
"""
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = RobertaTokenizer.from_pretrained(model_name)
"""

### Preprocess test data

In [29]:
# Preprocessing depends on the model and on the dataset used; adapt it when either changes.
def preprocess_function1(examples):
    """Tokenize one batch of code-translation pairs.

    Each source snippet is prefixed with ``translate <input_language> to
    <target_language>:`` and tokenized with ``encoder_tokenizer``. The target
    snippets are tokenized with the same tokenizer and stored as labels.
    Inputs and targets are both padded / truncated to 512 tokens.

    Args:
        examples: Batch as a mapping of column name to list of values. The
            columns 'input_language', 'target_language', 'input_code' and
            'target_code' are read.

    Returns:
        The tokenizer output for the prefixed inputs, extended by a
        "labels" entry that holds the target token ids.
    """
    inputs = [
        f"translate {lang1} to {lang2}:" + src_code
        for lang1, lang2, src_code in zip(
            examples['input_language'],
            examples['target_language'],
            examples['input_code'],
        )
    ]

    # Inputs and targets share the same length handling.
    tokenizer_kwargs = {
        "max_length": 512,
        "padding": "max_length",
        "truncation": True,
    }

    model_inputs = encoder_tokenizer(inputs, **tokenizer_kwargs)

    # `text_target` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = encoder_tokenizer(text_target=examples['target_code'], **tokenizer_kwargs)

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs


In [None]:
# Tokenize the complete test set batch-wise with the preprocessing function.
tokenized_datasets_test1 = test_dataset.map(
    function=preprocess_function1,
    batched=True,
)
#tokenized_datasets_test2 = test_dataset.map(preprocess_function2, batched=True)

## Model evaluation

### Predict test data with the model

In [None]:
# Prediction loop for the CodeBERT encoder-decoder model:
# generate a translation for every sample of the tokenized test set.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # inference only; has no effect if the model already is in eval mode

preds = []
for test in tqdm(tokenized_datasets_test1, desc="Generating Predictions"):
    input_ids = torch.tensor([test["input_ids"]]).to(device)
    # The inputs are padded to a fixed length, so the attention mask has to be
    # passed along; otherwise the padding positions may be attended to.
    attention_mask = torch.tensor([test["attention_mask"]]).to(device)
    output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=50,
        do_sample=False,
    )
    translated_code = decoder_tokenizer.decode(output[0], skip_special_tokens=True)
    preds.append(translated_code)

In [None]:
# Prediction loop for all the other models (those loaded together with a single `tokenizer`):
# generate a translation for every sample of the tokenized test set.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # inference only; has no effect if the model already is in eval mode

preds = []
for test in tqdm(tokenized_datasets_test1, desc="Generating Predictions"):
    input_ids = torch.tensor([test["input_ids"]]).to(device)
    # The inputs are padded to a fixed length, so the attention mask has to be
    # passed along; otherwise the padding positions may be attended to.
    attention_mask = torch.tensor([test["attention_mask"]]).to(device)
    output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=1024,
    )
    translated_code = tokenizer.decode(output[0], skip_special_tokens=True)
    preds.append(translated_code)

### Create list of true labels for the test data

In [None]:
# Build the reference texts by decoding the tokenized targets again, so each
# reference went through the same tokenization / truncation as the labels.
# The labels were produced by `encoder_tokenizer`, hence it is also used to decode them.
refs = [
    encoder_tokenizer.decode(test["labels"], skip_special_tokens=True)
    for test in tqdm(tokenized_datasets_test1, desc="Decoding References")
]

In [None]:
# Sanity check: show the first prediction followed by its reference.
first_prediction = preds[0]
print(first_prediction)
first_reference = refs[0]
print(first_reference)

### Evaluate the model using the test data and the benchmarking metrics

In [None]:
# Run the benchmarking metrics on the predictions and store the results.
# The hyperparameters are passed to `benchmark_model` together with the results.
# NOTE(review): the "todo" entries, `dataset_name` and `training_size` look like
# placeholders - fill them in before relying on the stored results.
hyperparameters = dict(
    learning_rate=1e-5,
    batch_size=2,
    num_epochs=5,
    optimizer="adamw_torch",
    weight_decay="todo",
    max_grad_norm="todo",
    warmup_ratio=0.1,
)

# NOTE(review): `lora=False` is passed although the active loader wraps the model
# in a LoRA adapter - confirm which value should be recorded.
benchmarks = benchmark_model(
    model_name=model_name,
    dataset_name="datasetname",
    training_size="# pairs",
    hyperparameters=hyperparameters,
    preds=preds,
    refs=refs,
    result_path=result_path,
    save_results=True,
    lora=False,
    quantization=False,
    earlystopping=True,
)
print(benchmarks)