### Imports

In [None]:
import datasets
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset
import numpy as np

### Data Preparation

In [None]:

# Load the 'code_translation' configuration of the xCodeEval dataset.
code_translation_dataset = datasets.load_dataset("NTU-NLP-sg/xCodeEval", 'code_translation')

# Print a summary of the loaded dataset object.
print(code_translation_dataset)


In [None]:
# Persist the loaded dataset locally so it can be restored later with `load_from_disk`.
code_translation_dataset.save_to_disk('./code_translation_dataset')

In [None]:
# Spot-check the `lang_cluster` value of one training row.
# NOTE(review): the index 4380574 is a hard-coded example; why this row was chosen is not
# documented here.
print(code_translation_dataset['train'][4380574]['lang_cluster'])


### Code Pairing

We must create code-code pairs in order to train our model. For that we use the method described in https://huggingface.co/datasets/NTU-NLP-sg/xCodeEval/discussions/1

In [2]:
# In your training environment
from datasets import load_from_disk
import itertools
import pandas as pd
import evaluate
import torch

In [4]:
# The NTU dataset was delivered together with all the models as it is 20 GB in size.
# NOTE(review): 'directory/to/ntu/dataset' looks like a placeholder — point it at the local
# copy of the dataset (e.g. a directory written by `save_to_disk`) before running.
code_translation_dataset = load_from_disk('directory/to/ntu/dataset')

In [5]:

def _sample_split(split_name, num_rows):
    """Shuffle one dataset split with seed 42 and return its first `num_rows` rows as a DataFrame."""
    shuffled = code_translation_dataset[split_name].shuffle(seed=42)
    return shuffled.select(range(num_rows)).to_pandas()


# Fixed-size random samples of the 'train', 'titan' and 'compact' splits;
# 'compact_small' is converted in full.
dataframe_code_translation_train = _sample_split('train', 50000)
dataframe_code_translation_test = _sample_split('titan', 7000)
dataframe_code_translation_val = _sample_split('compact', 5000)
dataframe_code_translation_val_small = code_translation_dataset['compact_small'].to_pandas()

# Bucket the rows of every frame by their shared src_uid.
grouped_train = dataframe_code_translation_train.groupby('src_uid')
grouped_test = dataframe_code_translation_test.groupby('src_uid')
grouped_val = dataframe_code_translation_val.groupby('src_uid')
grouped_val_small = dataframe_code_translation_val_small.groupby('src_uid')


In [None]:
# For every src_uid, count how many distinct lang_cluster values its rows cover.
lang_clusters_by_uid = dataframe_code_translation_train.groupby('src_uid')['lang_cluster']
lang_cluster_counts = lang_clusters_by_uid.nunique()

# Show how many src_uid groups have 1, 2, 3, ... distinct languages.
lang_cluster_distribution = lang_cluster_counts.value_counts()
print(lang_cluster_distribution)


In [None]:
#grouped.get_group(code_translation_dataset['compact']['src_uid'][0])

In [None]:
def create_pairs(grouped):
    """Build every ordered cross-language pair of rows within one src_uid group.

    Parameters
    ----------
    grouped : pandas.DataFrame
        The rows of a single group (as handed over by ``groupby('src_uid').apply``).
        Must provide the columns ``src_uid``, ``lang_cluster`` and ``source_code``.

    Returns
    -------
    list of dict
        One record per combination of a row in language A with a row in language B,
        for every ordered pair of distinct languages (A, B) present in the group.
        Each record has the keys ``src_uid``, ``input_language``, ``input_code``,
        ``target_language`` and ``target_code``. The list is empty when the group
        contains fewer than two distinct languages.
    """
    languages = grouped['lang_cluster'].unique()

    # Pull the two needed columns out once per language instead of rebuilding a
    # Series for every row inside the pairing loops (which is what iterrows does).
    rows_by_language = {}
    for language in languages:
        language_rows = grouped[grouped['lang_cluster'] == language]
        rows_by_language[language] = list(
            zip(language_rows['src_uid'], language_rows['source_code'])
        )

    result = []
    for source_language, target_language in itertools.permutations(languages, 2):
        # Pair each row of the source language with each row of the target language.
        row_pairs = itertools.product(
            rows_by_language[source_language], rows_by_language[target_language]
        )
        for (src_uid, source_code), (_, target_code) in row_pairs:
            result.append({
                'src_uid': src_uid,  # src_uid is the same for both rows in the pair
                'input_language': source_language,
                'input_code': source_code,
                'target_language': target_language,
                'target_code': target_code
            })

    return result
    
    

In [None]:
# Apply the function to each group
# Every call hands one per-src_uid DataFrame to `create_pairs`, which returns a list of
# pair dicts for that group; the cells below iterate the results as one list per group.
grouped_pairs_train = grouped_train.apply(create_pairs)
grouped_pairs_test = grouped_test.apply(create_pairs)
grouped_pairs_val = grouped_val.apply(create_pairs)
grouped_pairs_val_small = grouped_val_small.apply(create_pairs)

In [None]:
# Total number of training pairs = combined length of all per-group pair lists.
num_rows = sum(map(len, grouped_pairs_train))

print(f"Total number of pairs: {num_rows}")



In [10]:
# Chain the per-group pair lists of each split into one long list of records and turn it
# into a DataFrame with one row per pair.
flattened_df_train = pd.DataFrame(list(itertools.chain.from_iterable(grouped_pairs_train)))
flattened_df_test = pd.DataFrame(list(itertools.chain.from_iterable(grouped_pairs_test)))
flattened_df_val = pd.DataFrame(list(itertools.chain.from_iterable(grouped_pairs_val)))
flattened_df_val_small = pd.DataFrame(list(itertools.chain.from_iterable(grouped_pairs_val_small)))

In [None]:
# Columns that make up one translation pair (everything `create_pairs` emits except 'src_uid').
columns = ['input_language', 'input_code', 'target_language', 'target_code']

# Only these (input_language, target_language) combinations are exported.
allowed_pairs = [
    ('Java', 'Kotlin'),
    ('Python', 'Kotlin'),
    ('C', 'Kotlin'),
    ('C++', 'Kotlin'),
    ('C#', 'Kotlin')
]

allowed_pairs_set = set(allowed_pairs)

# Concatenate all DataFrames into one and remove the 'src_uid' column if it exists.
# The drop is done on the combined copy only: rebinding the per-split `flattened_df_*`
# frames without 'src_uid' would break any later cell that still groups them by 'src_uid'.
combined_df = pd.concat(
    [flattened_df_train, flattened_df_test, flattened_df_val, flattened_df_val_small],
    ignore_index=True,
).drop(columns=['src_uid'], errors='ignore')

# Filter the rows where (input_language, target_language) is in the allowed pairs.
# The mask is built from the two zipped columns, which avoids a row-wise DataFrame.apply.
is_allowed_pair = pd.Series(
    [
        pair in allowed_pairs_set
        for pair in zip(combined_df['input_language'], combined_df['target_language'])
    ],
    index=combined_df.index,
    dtype=bool,
)
filtered_df = combined_df[is_allowed_pair]

filtered_df.to_csv('filtered_combined_data.csv', index=False)

In [None]:
# Inspect the flattened training pairs.
flattened_df_train

In [None]:
# Show every pair row that shares the src_uid of the row labelled 1.
# NOTE(review): this raises KeyError if the 'src_uid' column has been removed from
# `flattened_df_train` before this cell runs.
flattened_df_train.groupby('src_uid').get_group(flattened_df_train['src_uid'][1])

### Setup for Finetuning

In [75]:

# The model that you want to train from the Hugging Face hub
model_name = "Salesforce/codet5-small"

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./NTU_results"

# Directory the fine-tuned model is saved to after training and reloaded from for the
# manual output check
saved_model_path = "./code_translation_model_2_epochs_50k_train"

# Number of training epochs
num_train_epochs = 2

# Tokenizer belonging to the pretrained checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

# TODO: try quantization
# device_map={"": 0} requests that the whole model be placed on device 0
# (presumably the first GPU — confirm for your setup).
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map={"": 0})

### Preprocessing

In [66]:
# preprocessing function for correct input format
def preprocess_function(examples):
    """Tokenize a batch of code pairs for seq2seq fine-tuning.

    Parameters
    ----------
    examples : mapping of column name -> list
        A batch as passed by ``Dataset.map(..., batched=True)``. It must provide the
        columns emitted by ``create_pairs``: ``input_language``, ``input_code``,
        ``target_language`` and ``target_code``.

    Returns
    -------
    The tokenizer output for the prompts (``"translate <src> to <tgt>:" + code``) with an
    additional ``labels`` entry holding the token ids of the target code. Sequences are
    truncated to 512 tokens and are NOT padded.
    """
    inputs = [
        f"translate {source_language} to {target_language}:" + source_code
        for source_language, target_language, source_code in zip(
            examples['input_language'],
            examples['target_language'],
            examples['input_code'],
        )
    ]
    targets = examples['target_code']

    # Padding is intentionally left to the data collator: DataCollatorForSeq2Seq pads
    # every batch to its longest member and fills the label padding with -100, so padded
    # positions are ignored by the loss. Padding to max_length here would instead store
    # pad-token ids in the labels and train the model on them.
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True
    )

    # `text_target` tokenizes the targets as labels; it replaces the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=targets,
        max_length=512,
        truncation=True
    )

    # Add labels to inputs
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs


In [67]:
# Wrap each split's pair DataFrame in a `Dataset` so it can be tokenized with `.map`.
ds_train = Dataset.from_pandas(flattened_df_train)
ds_test = Dataset.from_pandas(flattened_df_test)
ds_val = Dataset.from_pandas(flattened_df_val)
ds_val_small = Dataset.from_pandas(flattened_df_val_small)

In [None]:
# Inspect the training dataset object.
ds_train

In [None]:
# Tokenize every split. With batched=True, `preprocess_function` receives a batch of
# examples (column name -> list of values) per call.
tokenized_datasets_train = ds_train.map(preprocess_function, batched=True)
tokenized_datasets_test = ds_test.map(preprocess_function, batched=True)
tokenized_datasets_val = ds_val.map(preprocess_function, batched=True)
tokenized_datasets_val_small = ds_val_small.map(preprocess_function, batched=True)


In [None]:
# Inspect the tokenized training dataset object.
tokenized_datasets_train

In [None]:
# Sanity check: decode the first tokenized training input back to text, with special
# tokens removed.
tokenizer.decode(tokenized_datasets_train[0]['input_ids'], skip_special_tokens=True)

In [72]:
# Batch collator that is handed to the trainer below.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [73]:
# for eval during training
def compute_metrics(eval_preds):
    """Score predicted token ids against the reference token ids.

    The GLUE/MRPC metric used previously is a sentence-pair classification metric and
    does not apply to sequence-to-sequence output, so the scores are computed directly
    with numpy.

    Parameters
    ----------
    eval_preds : pair ``(predictions, labels)``
        ``predictions`` is an array of logits (the shape of ``labels`` plus one trailing
        vocabulary axis), an array of token ids with the same shape as ``labels``, or a
        tuple whose first element is one of those. ``labels`` holds the reference token
        ids; positions equal to -100 are ignored.

    Returns
    -------
    dict
        ``accuracy``: fraction of non-ignored label positions predicted correctly.
        ``exact_match``: fraction of sequences whose non-ignored positions are all correct.
        Both are 0.0 when there is nothing to score.

    Raises
    ------
    ValueError
        If the predictions cannot be aligned position by position with the labels.
    """
    predictions, labels = eval_preds

    # Some models return several outputs per step; the first one carries the logits.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Logits carry one extra (vocabulary) axis: reduce them to token ids.
    if predictions.ndim == labels.ndim + 1:
        predictions = predictions.argmax(axis=-1)

    if predictions.shape != labels.shape:
        raise ValueError(
            f"predictions shape {predictions.shape} does not match labels shape {labels.shape}"
        )

    scored = labels != -100
    correct = (predictions == labels) & scored
    num_scored = int(scored.sum())

    accuracy = float(correct.sum() / num_scored) if num_scored else 0.0
    # A sequence matches exactly when every position is either correct or ignored.
    exact_match = float((correct | ~scored).all(axis=-1).mean()) if labels.size else 0.0

    return {"accuracy": accuracy, "exact_match": exact_match}

### Model fine-tuning

In [None]:
# Fine-tuning configuration. Checkpoints are written to `output_dir` from the setup cell,
# so the notebook uses the single configured location instead of a second hard-coded path.
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",              # evaluate on the eval set once per epoch
    learning_rate=2e-4,  # play around with it
    per_device_train_batch_size=4,  # play around with it
    per_device_eval_batch_size=4, # play around with it
    gradient_accumulation_steps = 2,          # 4 * 2 = 8 examples per optimizer step and device
    optim = "paged_adamw_32bit",
    weight_decay=0.001,
    max_grad_norm = 0.3,
    max_steps = -1,                           # -1: the run length is set by num_train_epochs
    save_total_limit=3,                       # keep at most 3 checkpoints on disk
    warmup_ratio = 0.03,
    group_by_length = True,                   # speeds up the training
    num_train_epochs=num_train_epochs,
    fp16=False,
    report_to = "tensorboard",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets_train,
    eval_dataset=tokenized_datasets_val_small,
    tokenizer=tokenizer,
    data_collator=data_collator,
    #compute_metrics=compute_metrics,
)
# Train model
trainer.train()
# Store the fine-tuned weights separately from the checkpoints in `output_dir`.
model.save_pretrained(saved_model_path)

### First manual check of the fine-tuned model output

In [None]:
# Reload the fine-tuned weights stored at `saved_model_path`; this rebinds `model`.
model = AutoModelForSeq2SeqLM.from_pretrained(saved_model_path)


In [None]:
# Check which Python type `input_ids` has when a single test example is read back.
print(type(tokenized_datasets_test[0]['input_ids']))

In [None]:
# Source language of the first test example. `create_pairs` stores it in the
# 'input_language' column; a 'lang_cluster_1' column does not exist in the pair datasets.
print(tokenized_datasets_test[0]["input_language"])

In [None]:
# Translate the first test example and print it next to its reference solution.
example = tokenized_datasets_test[0]

# Build the batch-of-one tensors on the device the model lives on, and pass the attention
# mask so that any padding in `input_ids` is not attended to during generation.
input_ids = torch.tensor([example["input_ids"]], device=model.device)
attention_mask = torch.tensor([example["attention_mask"]], device=model.device)

output = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=1024)
translated_code = tokenizer.decode(output[0], skip_special_tokens=True)
print("Translated code:", translated_code)

# -100 marks ignored label positions and is not a valid token id, so drop it before decoding.
label_ids = [token_id for token_id in example["labels"] if token_id != -100]
print("Target code:", tokenizer.decode(label_ids, skip_special_tokens=True))