In [None]:
# If using Collab, uncomment these lines in order to install the correct modules

# !pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124
# !pip install transformers datasets evaluate -q

In [1]:
# ----------------------------------------
# Load the datasets
# ----------------------------------------

from datasets import load_dataset

dataset = load_dataset("csv", data_files={
    "test" : "processed_data/ft_test_processed.csv", 
    "train" : "processed_data/ft_train_processed.csv", 
    "validation" : "processed_data/ft_valid_processed.csv"}
    )

In [None]:
# ----------------------------------------
# Set up WandB locally (if running through Google Collab, this step should not be necessary)
# ----------------------------------------
import wandb

wandb.init(project="RyanRussell_CodeT5_Finetuning")

In [4]:
import torch

In [None]:
# ----------------------------------------
# Load Pre-trained model & Tokenizer
# ----------------------------------------
from transformers import T5ForConditionalGeneration
from transformers import RobertaTokenizer
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback

model_checkpoint = "Salesforce/codet5-small"

model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)

tokenizer = RobertaTokenizer.from_pretrained(model_checkpoint)
tokenizer.add_tokens(["<IF-STMT>"])

model.resize_token_embeddings(len(tokenizer))

# Since I'm running on a mac, I'm using mps. If running on Collab, it should select cuda automatically
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.backends.cuda.is_available() else "cpu")
model.to(device)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


T5ForConditionalGeneration(
  (shared): Embedding(32101, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32101, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [6]:
# ----------------------------------------
# Prepare the fine-tuning dataset
# ----------------------------------------

def preprocess_function(examples):
    inputs = examples["formatted_method"]
    targets = examples["if_statement"]
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=256, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs



tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/4987 [00:00<?, ? examples/s]

Map:   0%|          | 0/49821 [00:00<?, ? examples/s]

Map:   0%|          | 0/4976 [00:00<?, ? examples/s]

In [7]:
# ----------------------------------------
# Define training arguments and Trainer
# ----------------------------------------

training_args = TrainingArguments(
    output_dir="./codet5-finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    learning_rate=5e-5,
    per_device_train_batch_size=32, # Tells the model how many instances are provided as input at the same time.
                                   # E.g. with batch_size=2 it means the model is dealing with 2 different examples. The available hardware limits the batch size you can have.
                                   # If the GPU runs out of memory, you'll get an error. This indicates the batch size is too large
    per_device_eval_batch_size=16,
    num_train_epochs=5, # Means every element will be seen 5 times. If you have too many epochs, you run the risk of overfitting. This could happen also if the dataset is too small for the task you're trying to perform
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_total_limit=2,
    logging_steps=100,
    push_to_hub=False,
    report_to="wandb"
)



trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset = tokenized_datasets["validation"],
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

  trainer = Trainer(


In [8]:
# ----------------------------------------
# Train the model
# ----------------------------------------
trainer.train()

# ----------------------------------------
# Evaluate on test set
# ----------------------------------------
metrics = trainer.evaluate(tokenized_datasets["test"])
print("Test Evaluation Metrics:", metrics)

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.0431,0.036314
2,0.0384,0.035021
3,0.0357,0.034031
4,0.0331,0.03384
5,0.0338,0.033807


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Test Evaluation Metrics: {'eval_loss': 0.035492219030857086, 'eval_runtime': 45.6357, 'eval_samples_per_second': 109.278, 'eval_steps_per_second': 6.837, 'epoch': 5.0}


In [None]:
# ----------------------------------------
# Save the model
# ----------------------------------------

trainer.save_model("Trained_Model/fine_tuned_model")
tokenizer.save_pretrained("Trained_Model/tokenizer")

('Trained_Model/tokenizer/tokenizer_config.json',
 'Trained_Model/tokenizer/special_tokens_map.json',
 'Trained_Model/tokenizer/vocab.json',
 'Trained_Model/tokenizer/merges.txt',
 'Trained_Model/tokenizer/added_tokens.json')

In [11]:
# ----------------------------------------
# Test on a simple example
# ----------------------------------------
code = """def factorial(x):
    <IF_STMT>
        return 0
    else:
        return x * factorial(x-1)"""

inputs = tokenizer(code, return_tensors="pt", padding=True, truncation=True)

# Sent inputs to device
inputs = {key : value.to(device) for key, value in inputs.items()}

outputs = model.generate(**inputs, max_length=256)
print("Generated if statement:\n", tokenizer.decode(outputs[0], skip_special_tokens=True))

Generated if statement:
 if x == 0:
