# Fine-tuning CodeT5 for Java Code Summarization

**INSTALL LIBRARIES**
----------------------
----------------------
----------------------
----------------------

In [None]:

!pip install transformers datasets evaluate rouge_score bert_score --quiet


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m14.2 MB/s[0m eta [36m0:00

In [None]:
pip install --upgrade transformers



**MOUNT DRIVE**
----------------------
----------------------
----------------------
----------------------

In [None]:

from google.colab import drive

# Attach Google Drive to the Colab filesystem at this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


**LOAD HF DATASET**
----------------------
----------------------
----------------------
----------------------

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Hugging Face Hub identifier shared by the tokenizer and the model weights.
checkpoint_name = "Salesforce/codet5-base"

# Java configuration of the CodeXGLUE code-to-text dataset.
dataset = load_dataset("code_x_glue_ct_code_to_text", "java")

# Tokenizer and seq2seq model are loaded from the same pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/26.7k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/141M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/4.25M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/9.38M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/164923 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5183 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10955 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

**PREPROCESS DATASET**
----------------------
----------------------
----------------------
----------------------

In [None]:
# Maximum token lengths used when tokenizing the code (input) and the summary (target).
max_input_length: int = 512
max_target_length: int = 128
# Value written over padding positions in the labels by preprocess().
label_pad_token_id: int = -100

# Whitespace normalization
def normalize_whitespace(text: str) -> str:
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is dropped, because ``str.split()`` with no
    arguments discards empty fields at both ends.

    Args:
        text: The string to clean up.

    Returns:
        The whitespace-separated tokens of ``text`` joined by single spaces
        (an empty string when ``text`` contains only whitespace).
    """
    tokens = text.split()
    return " ".join(tokens)

# Preprocessing function
def preprocess(example):
    """Tokenize Java code and its docstring into model inputs and loss labels.

    Works for both calling conventions used in this notebook:

    * a single example, where ``example["code"]`` and ``example["docstring"]``
      are strings (as in the unit tests), and
    * a batch from ``Dataset.map(..., batched=True)``, where both fields are
      lists of strings.

    Args:
        example: Mapping with ``"code"`` and ``"docstring"`` entries, each a
            ``str`` or a list of ``str``.

    Returns:
        The tokenizer output for the code (padded/truncated to
        ``max_input_length``) with an added ``"labels"`` entry holding the
        summary token ids (padded/truncated to ``max_target_length``), where
        every padding id is replaced by ``label_pad_token_id``.
    """
    raw_code = example["code"]
    raw_summary = example["docstring"]
    # With batched=True the fields arrive as lists; normalize_whitespace only accepts a str.
    is_single = isinstance(raw_code, str)

    if is_single:
        code = normalize_whitespace(raw_code)
        summary = normalize_whitespace(raw_summary)
    else:
        code = [normalize_whitespace(text) for text in raw_code]
        summary = [normalize_whitespace(text) for text in raw_summary]

    model_inputs = tokenizer(code, max_length=max_input_length, padding="max_length", truncation=True)

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=summary, max_length=max_target_length, padding="max_length", truncation=True)

    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # Replaces padding tokens with ID -100 which ensures that they are not considered while computing loss
        return [(token if token != pad_id else label_pad_token_id) for token in token_ids]

    label_ids = labels["input_ids"]
    if is_single:
        model_inputs["labels"] = _mask_padding(label_ids)
    else:
        # Batched output is a list of sequences; mask each sequence separately.
        model_inputs["labels"] = [_mask_padding(sequence) for sequence in label_ids]

    return model_inputs

In [None]:
# Apply preprocessing to all dataset splits (train, validation, test), dropping the raw text columns
train_dataset, val_dataset, test_dataset = (
    dataset[split_name].map(preprocess, batched=True, remove_columns=["code", "docstring"])
    for split_name in ("train", "validation", "test")
)

**TRAIN MODEL**
----------------------
----------------------
----------------------
----------------------

In [None]:
# ceil(164,923 training examples / batch size 8); assumes a single device and no gradient accumulation
steps_per_epoch = 20616
output_dir = "/content/drive/MyDrive/codet5_checkpoints"  # to save model checkpoints

# Training configuration for Seq2Seq model with checkpointing every quarter epoch, logging, and mixed-precision (if GPU available)
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    eval_strategy="epoch",            # formerly `evaluation_strategy`; the old name raises TypeError on current transformers
    save_strategy="steps",
    save_steps=steps_per_epoch // 4,  # Save four times per epoch (every quarter epoch)
    save_total_limit=4,               # Keep only the 4 most recent checkpoints

    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    predict_with_generate=True,       # run generate() during evaluation
    fp16=torch.cuda.is_available(),   # mixed precision only when a CUDA GPU is present
    logging_dir='./logs',
    logging_steps=100,
    report_to="none",                 # disable external experiment trackers
)

TypeError: Seq2SeqTrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [None]:
import os

# Build the trainer; `processing_class` is the current name for the deprecated `tokenizer` argument.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
)

# Resume training from the specified checkpoint when it exists; otherwise start from the
# pretrained weights so the notebook still runs on a fresh Drive.
resume_checkpoint = "/content/drive/MyDrive/codet5_checkpoints/checkpoint-41232"
if os.path.isdir(resume_checkpoint):
    trainer.train(resume_from_checkpoint=resume_checkpoint)
else:
    print(f"Checkpoint not found at {resume_checkpoint}; training from the pretrained model instead.")
    trainer.train()


**UNIT TESTS**
----------------------
----------------------
----------------------
----------------------

In [None]:
import pytest
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-base")

# Sample input with deliberately irregular spacing in both fields
example = dict(
    code="public  int  add ( int a ,  int b )   { return a + b; }",
    docstring=" Adds    two  integers.  ",
)

def test_whitespace_normalization():
    """normalize_whitespace collapses repeated spaces and trims both ends of the sample fields."""
    code_text, doc_text = (
        normalize_whitespace(example[field]) for field in ("code", "docstring")
    )
    # No double space may survive anywhere in the cleaned code.
    assert code_text.find("  ") == -1
    assert code_text.startswith("public int add")
    assert doc_text == "Adds two integers."

def test_tokenization_keys():
    """preprocess output exposes the three fields this notebook relies on."""
    encoded = preprocess(example)
    for expected_key in ("input_ids", "attention_mask", "labels"):
        assert expected_key in encoded

def test_input_truncation():
    """Over-long code is cut down to exactly max_input_length token ids."""
    oversized = dict(example)
    oversized["code"] = "int a = 0; " * 1000  # very long code
    encoded = preprocess(oversized)
    assert len(encoded["input_ids"]) == max_input_length

def test_label_padding_masking():
    """Padding positions in the labels carry label_pad_token_id instead of the tokenizer's pad id."""
    label_ids = preprocess(example)["labels"]
    assert label_pad_token_id in label_ids
    assert tokenizer.pad_token_id not in label_ids

def test_label_truncation():
    """Over-long summaries are cut down to exactly max_target_length label ids."""
    oversized = dict(example)
    oversized["docstring"] = "This is a long summary. " * 100
    label_ids = preprocess(oversized)["labels"]
    assert len(label_ids) == max_target_length

In [None]:
# Run every unit test in definition order; the first failing assertion stops the cell.
for unit_test in (
    test_whitespace_normalization,
    test_tokenization_keys,
    test_input_truncation,
    test_label_padding_masking,
    test_label_truncation,
):
    unit_test()