In [4]:
from datasets import load_dataset
import torch
# Load dataset
dataset = load_dataset("google/code_x_glue_ct_code_to_text", "java")

# Reduce size for faster iteration
import random
train_data = dataset["train"].select(random.sample(range(len(dataset["train"])), k=int(1.0 * len(dataset["train"]))))
val_data = dataset["validation"].select(random.sample(range(len(dataset["validation"])), k=int(1.0 * len(dataset["validation"]))))
print("Train size:", len(train_data))
print("Validation size:", len(val_data))


  from .autonotebook import tqdm as notebook_tqdm


Train size: 164923
Validation size: 5183


In [5]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({'pad_token': '<pad>', 'sep_token': '<sep>', 'bos_token': '<s>', 'eos_token': '</s>'})


4

In [6]:
def preprocess(example):
    code = example["code"]
    docstring = example["docstring"]
    full_text = f"<s> {code} </s> <sep> {docstring}"

    # Tokenize the full string
    tokens = tokenizer(
        full_text,
        padding="max_length",
        truncation=True,
        max_length=512
    )

    # Copy labels from input_ids
    labels = tokens["input_ids"][:]

    # Mask out the code portion
    sep_id = tokenizer.convert_tokens_to_ids("<sep>")
    try:
        sep_index = labels.index(sep_id)
    except ValueError:
        sep_index = 0  # fallback: mask entire sequence

    labels[:sep_index + 1] = [-100] * (sep_index + 1)
    tokens["labels"] = labels

    return tokens
train_data = train_data.map(preprocess, remove_columns=train_data.column_names)
val_data = val_data.map(preprocess, remove_columns=val_data.column_names)

# Convert to PyTorch tensors
train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


Map: 100%|███████████████████████████████████████████████████████████████████████████| 164923/164923 [02:46<00:00, 989.97 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████| 5183/5183 [00:05<00:00, 964.31 examples/s]


In [None]:
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import os

# Load and resize the pretrained model
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))  # Resize for new tokens

# Optional: Enable progress bar
os.environ["WANDB_DISABLED"] = "true"  # Disable WandB if accidentally enabled
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"

# Define training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-docstring",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,                       
    eval_strategy="epoch",                    
    save_strategy="epoch",                    
    logging_dir="./logs",                      
    logging_steps=1,                          
    disable_tqdm=False,                       
    report_to="none",                         
    logging_first_step=True
)

# Data collator to handle padding and masking
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator,
)



if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

# ✅ Start training with progress bar
trainer.train()


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


NameError: name 'torch' is not defined

In [7]:
# Save model and tokenizer
model.save_pretrained("./gpt2-docstring-model")
tokenizer.save_pretrained("./gpt2-docstring-model")


NameError: name 'model' is not defined

In [52]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./gpt2-docstring-model")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-docstring-model")


In [2]:
def generate_docstring_few_shot(test_code, max_length=200):
    few_shot_prompt = """
<s> public int add(int a, int b) { return a + b; } </s> <sep> Adds two integers and returns the sum.

<s> public int multiply(int a, int b) { return a * b; } </s> <sep> Multiplies two integers and returns the product.

<s> public boolean isEven(int num) { return num % 2 == 0; } </s> <sep> Checks if a number is even.

<s> public String greet(String name) { return "Hello " + name; } </s> <sep> Greets the user by name.
"""

    # Append the new example
    prompt = few_shot_prompt.strip() + f"\n<s> {test_code} </s> <sep>"

    # Tokenize prompt
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    input_len = input_ids.shape[1]

    # Generate continuation from after the prompt
    output_ids = model.generate(
        input_ids,
        max_length=input_len + 50,  # buffer for generation
        num_beams=9,
        no_repeat_ngram_size=4,
        early_stopping=True,
        pad_token_id=tokenizer.eos_token_id
    )

    # Only decode the newly generated tokens (after prompt)
    generated_ids = output_ids[0][input_len:]  # exclude prompt
    result = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    return result


In [3]:
# java_code = "public boolean isEqual(int a, int b) { return a==b; }"
java_code ="public int subtract(int a, int b) { return a - b; }"

docstring = generate_docstring_few_shot(java_code)
print("Generated docstring:\n", docstring)


NameError: name 'tokenizer' is not defined