### Imports

In [None]:
from datasets import Dataset
from peft import LoraConfig, get_peft_model 
import pandas as pd
import evaluate
import torch
from transformers import T5ForConditionalGeneration, RobertaTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, BitsAndBytesConfig, EarlyStoppingCallback
import numpy as np
import bitsandbytes as bnb
from sklearn.model_selection import train_test_split
from peft import PeftModel, PeftConfig
import os

### Data Preparation

In [2]:
# Build the translation-pair dataframe for the V3 and V4 datasets
def create_dataframe_for_V3_or_V4_data(model_path):
    """Expand a wide, one-column-per-language CSV into translation pairs.

    The CSV is read with ``pd.read_csv``. Its 'py' and 'cpp' columns are
    renamed to 'python' and 'c++', and the 'id' column is dropped. Every
    remaining column is treated as a language, and each CSV row yields one
    output row per ordered pair of distinct language columns.

    Args:
        model_path: Path of the dataset CSV (anything ``pd.read_csv`` accepts).

    Returns:
        A DataFrame with the columns 'input_language', 'input_code',
        'target_language', 'target_code' and 'target_column', where
        'target_column' is '<input_language>_<target_language>'. A CSV
        without data rows yields an empty DataFrame with these columns.

    Raises:
        KeyError: If the CSV has no 'id' column.
    """
    from itertools import permutations

    df = pd.read_csv(model_path)
    df = df.rename(columns={'py': 'python', 'cpp': 'c++'}).drop('id', axis=1)

    # The ordered (source, target) pairs are the same for every row, so they
    # are computed once instead of inside the row loop.
    language_pairs = list(permutations(df.columns, 2))

    records = [
        {
            'input_language': source,
            'input_code': row[source],
            'target_language': target,
            'target_code': row[target],
        }
        for _, row in df.iterrows()
        for source, target in language_pairs
    ]

    # Explicit columns keep the schema intact even when there are no records.
    pairs_df = pd.DataFrame(
        records,
        columns=['input_language', 'input_code', 'target_language', 'target_code'],
    )

    # Create a new column 'target_column' combining 'input_language' and 'target_language'
    pairs_df['target_column'] = pairs_df['input_language'] + '_' + pairs_df['target_language']
    return pairs_df


In [3]:
# Load a V1 or V2 dataset, which already stores one translation pair per row
def create_dataframe_for_V2_data(model_path):
    """Read a V1/V2 dataset CSV and add the translation-direction column.

    Args:
        model_path: Path of the dataset CSV (anything ``pd.read_csv`` accepts).
            The CSV must contain 'input_language' and 'target_language'.

    Returns:
        The loaded DataFrame with an extra 'target_column' holding
        '<input_language>_<target_language>' for each row.
    """
    pairs = pd.read_csv(model_path)
    direction = pairs['input_language'] + '_' + pairs['target_language']
    return pairs.assign(target_column=direction)

In [None]:
# Load the data CSV file into a pandas DataFrame
# (despite its name, `model_path` is the path of the dataset CSV)
model_path = os.path.join("datasets", "V3.csv")

# Pick the loader from the CSV header instead of toggling it by hand:
# V1/V2 files already hold one translation pair per row, whereas V3/V4 files
# hold one column per language and must be expanded into pairs first.
csv_columns = set(pd.read_csv(model_path, nrows=0).columns)
if {'input_language', 'target_language'} <= csv_columns:
    df = create_dataframe_for_V2_data(model_path)
else:
    df = create_dataframe_for_V3_or_V4_data(model_path)

test_size = 150  # Set to 150 as benchmarking would take too long with more test data

# Perform the stratified split for the test set (the test set size is given in number of samples).
train_val_df, test_df = train_test_split(
    df, test_size=test_size, stratify=df['target_column'], random_state=42
)

# Now split the train+val set into training and validation (5% of the remaining data).
# Since the test set has a fixed size, all remaining data is used for training and validation.
train_df, val_df = train_test_split(
    train_val_df, test_size=0.05, stratify=train_val_df['target_column'], random_state=42
)

# Convert the DataFrames back into Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Check the splits
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

# Optional: Display the class distribution of the datasets to ensure balance
print(f"Train class distribution:\n{train_df['target_column'].value_counts(normalize=True)}")
print(f"Validation class distribution:\n{val_df['target_column'].value_counts(normalize=True)}")
print(f"Test class distribution:\n{test_df['target_column'].value_counts(normalize=True)}")

### Setup for Finetuning

In [3]:

# Hugging Face hub checkpoint that is fine-tuned
model_name = "Salesforce/codet5-base"

# Tokenizer loaded from the same checkpoint as the model
tokenizer = RobertaTokenizer.from_pretrained(model_name)

# Upper bound on the number of training epochs
num_train_epochs = 10

# Directory that receives the model predictions and checkpoints during training
output_dir = "./own_results"

# Destination of the fine-tuned model (replace the placeholder directory name)
saved_model_path = os.path.join("training_codes", "<directory_name_for_saved_model>")


### LoRA and Quantization setup

This part is different for each model: every architecture needs its own configuration, tokenizer, and model classes. For example, BERT uses the `BertConfig` class and GPT-2 uses the `GPT2Config` class.
We decided to display only the setup for the CodeT5 models, as the three best-performing fine-tuned models are all CodeT5 models.

The setup code for the other models is quite similar, but the matching tokenizer and model classes must be used.

In [4]:
# Quantization settings handed to `from_pretrained` when the base model is loaded:
# 4-bit NF4 weights, float16 compute dtype, double quantization disabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.float16,
)

In [5]:
# Load the base checkpoint with the quantization settings defined above.
# NOTE(review): device_map={"": 0} presumably places the whole model on device 0 - confirm.
model = T5ForConditionalGeneration.from_pretrained(
    model_name,
    device_map={"": 0},
    quantization_config=bnb_config,
)

In [6]:
# LoRA adapter settings for the sequence-to-sequence task
lora_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    bias="none",
    r=8,               # rank; small ranks such as 4 or 8 typically work well
    lora_alpha=16,     # scaling factor; tune for task-specific performance
    lora_dropout=0.1,  # dropout that regularizes the LoRA parameters
)

# Wrap the loaded base model with the LoRA adapters
model = get_peft_model(model, lora_config)

### Data preprocessing

In [7]:
# Preprocessing method; it depends on the model and the dataset used, so adapt it accordingly
def preprocess_function1(examples):
    """Tokenize a batch of translation examples for seq2seq training.

    Each input is the source code prefixed with
    'translate <input_language> to <target_language>:'. Inputs and targets
    are truncated to 512 tokens.

    No padding is applied here. Padding to ``max_length`` would store real
    pad-token ids in the labels, so padded positions would count towards the
    loss, and every example would have the same length. The sequences are
    therefore left unpadded and must be padded per batch by a collator such
    as ``DataCollatorForSeq2Seq``.

    Args:
        examples: Batch as a dict of lists with the keys 'input_language',
            'target_language', 'input_code' and 'target_code'.

    Returns:
        The tokenizer output for the prompts, with the target token ids added
        under the key "labels".
    """
    inputs = [
        f"translate {source_lang} to {target_lang}:" + source_code
        for source_lang, target_lang, source_code in zip(
            examples['input_language'],
            examples['target_language'],
            examples['input_code'],
        )
    ]

    # Tokenize inputs
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    # Tokenize targets; `text_target` replaces the deprecated
    # `as_target_tokenizer()` context manager
    labels = tokenizer(
        text_target=examples['target_code'],
        max_length=512,
        truncation=True,
    )

    # Add labels to inputs
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs


In [None]:
# Tokenize the train, test and validation splits (in that order) in batched mode
tokenized_datasets_train1, tokenized_datasets_test1, tokenized_datasets_val1 = (
    split.map(preprocess_function1, batched=True)
    for split in (train_dataset, test_dataset, val_dataset)
)

In [11]:
# Collator that assembles the training and evaluation batches; it is handed to the trainer
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

### Model Training

In [None]:
training_args = Seq2SeqTrainingArguments(
    # Output and logging
    output_dir=output_dir,
    logging_dir="./logs/early_stopping_lora_4bit_quant_codeT5-base-own-dataV4_10_epochs_batchsize_8",
    logging_strategy="steps",
    logging_steps=100,
    report_to="tensorboard",
    # Evaluate and checkpoint once per epoch, keeping at most three checkpoints
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_steps=2000,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # Schedule
    num_train_epochs=num_train_epochs,
    max_steps=-1,
    warmup_ratio=0.03,
    # Batching: 8 samples per device step, gradients accumulated over 2 steps
    per_device_train_batch_size=8,  # play around with it
    per_device_eval_batch_size=8,  # play around with it
    gradient_accumulation_steps=2,
    group_by_length=True,  # speeds up the training
    # Optimizer
    optim="adamw_torch",
    learning_rate=2e-4,  # play around with it
    weight_decay=0.001,
    max_grad_norm=0.3,
    # Mixed precision is disabled
    fp16=False,
    bf16=False,
)

# Stop early once the monitored metric has not improved for 3 evaluations
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_datasets_train1,
    eval_dataset=tokenized_datasets_val1,
    callbacks=[early_stopping],
)

# Train model
trainer.train()

In [21]:
# Save the trained model to `saved_model_path`; the checking cell below reloads it from there
model.save_pretrained(saved_model_path)

### Manual check of the performance of the model

In [25]:
# Reload the fine-tuned model for inference on the CPU
peft_model_id = saved_model_path

# The saved PEFT config records which base checkpoint the adapter belongs to
config = PeftConfig.from_pretrained(peft_model_id)
base_checkpoint = config.base_model_name_or_path

model = T5ForConditionalGeneration.from_pretrained(base_checkpoint, device_map="cpu")

tokenizer = RobertaTokenizer.from_pretrained(base_checkpoint)

# Attach the saved adapter to the base model, then merge it and unload the PEFT wrapper
model = PeftModel.from_pretrained(model, peft_model_id).merge_and_unload()

In [None]:
# Rebuild the prompt text from a tokenized test example. A hand-written prompt
# also works, e.g. "translate java to python: public int div(int a, int b) { return a / b; }"
prompt = tokenizer.decode(tokenized_datasets_test1[1]["input_ids"], skip_special_tokens=True)
inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)

# Generate translation; the attention mask is passed explicitly instead of
# leaving it to `generate` to infer
outputs = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=1024,
)
translated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(translated_code)

In [None]:
# Show the decoded model input (prompt plus source code) of the same test example
print(tokenizer.decode(tokenized_datasets_test1[1]["input_ids"], skip_special_tokens=True))

In [None]:
# Translate one tokenized test example directly from its token ids and
# print the result next to the reference translation
sample = tokenized_datasets_test1[1]
source_ids = torch.tensor([sample["input_ids"]])  # add a batch dimension of 1

output = model.generate(input_ids=source_ids, max_length=1024)
translated_code = tokenizer.decode(output[0], skip_special_tokens=True)
print("Translated code:", translated_code)

reference_code = tokenizer.decode(sample["labels"], skip_special_tokens=True)
print("Target code:", reference_code)

In [None]:
# Dump every entry the trainer recorded in its log history
log_history = trainer.state.log_history

for log in log_history:
    print(log)

In [None]:
import matplotlib.pyplot as plt

# Logged history of the training run
log_history = trainer.state.log_history

# Entries carrying 'loss' are training logs; entries carrying 'eval_loss' are validation logs
training_entries = [entry for entry in log_history if 'loss' in entry]
validation_entries = [entry for entry in log_history if 'eval_loss' in entry]

train_loss = [entry['loss'] for entry in training_entries]
steps_train = [entry['step'] for entry in training_entries]
eval_loss = [entry['eval_loss'] for entry in validation_entries]
steps_eval = [entry['step'] for entry in validation_entries]

# Plot both loss curves against the training step
plt.figure(figsize=(10, 6))
for steps, losses, label in (
    (steps_train, train_loss, "Training Loss"),
    (steps_eval, eval_loss, "Validation Loss"),
):
    plt.plot(steps, losses, label=label, marker='o')
plt.xlabel("Steps")
plt.ylabel("Loss")
plt.title("Training and Validation Loss")
plt.legend()
plt.grid(True)
plt.show()
