In [None]:
import os
from pathlib import Path

from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer

In [None]:
# Project root, taken as the parent of the current working directory.
# NOTE(review): assumes the kernel is started from a direct subdirectory of the
# repository (e.g. a notebooks folder) — confirm before running elsewhere.
root_dir = Path.cwd().parent

In [None]:
# Read the training split from <root>/data/train.jsonl with the "json" loader,
# keeping the datasets cache under <root>/data/cache.
datasets = load_dataset(
    "json",
    data_files={"train": str(root_dir / "data" / "train.jsonl")},
    cache_dir=root_dir / "data" / "cache",
    split="train",
)

In [5]:
# Alternative data source (disabled):
# datasets = load_dataset("code_x_glue_ct_code_to_text", 'python', split="train")

# Tokenizer of the same checkpoint name that is loaded as the model further down.
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5p-220m")

def preprocess_function(examples, max_source_length=320, max_target_length=128):
    """Tokenize a batch of code/docstring pairs for seq2seq fine-tuning.

    Each entry of ``code_tokens`` is joined with single spaces to form an
    encoder input, and each entry of ``docstring_tokens`` is joined the same
    way to form the target. Both are tokenized with the module-level
    ``tokenizer``, truncated and padded to a fixed length.

    Args:
        examples: Batch mapping with ``"code_tokens"`` and
            ``"docstring_tokens"`` entries, each a list of token lists
            (the batched layout passed by ``Dataset.map(batched=True)``).
        max_source_length: Fixed length of the encoder inputs.
        max_target_length: Fixed length of the label sequences.

    Returns:
        The tokenizer output for the sources, with an added ``"labels"`` entry
        holding the target ids where every padding id is replaced by ``-100``.
    """
    sources = [" ".join(tokens) for tokens in examples["code_tokens"]]
    targets = [" ".join(tokens) for tokens in examples["docstring_tokens"]]

    model_inputs = tokenizer(sources, max_length=max_source_length, padding="max_length", truncation=True)
    target_encoding = tokenizer(targets, max_length=max_target_length, padding="max_length", truncation=True)

    # Padding positions are marked with -100 (PyTorch's default ignore_index),
    # presumably so the loss skips them. The pad id is looked up once rather
    # than once per token.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in target_encoding["input_ids"]
    ]
    return model_inputs

# Tokenize the whole dataset in batches, dropping the raw columns so only the
# tokenizer outputs and labels remain.
train_data = datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=datasets.column_names,
    # Use at most 64 worker processes, but never more than the CPUs available,
    # so the cell does not oversubscribe smaller machines.
    num_proc=min(64, os.cpu_count() or 1),
    # Always recompute instead of reusing a previously cached map result.
    load_from_cache_file=False,
)

In [None]:
# Load the pretrained seq2seq checkpoint to fine-tune.
# There is no explicit `.to("cuda")`: Trainer places the model on the device
# selected by TrainingArguments, and a hard-coded "cuda" raises on machines
# without a GPU.
model = AutoModelForSeq2SeqLM.from_pretrained("Salesforce/codet5p-220m")

# Training configuration. Checkpoints and logs share one directory under
# <root>/modeling/models/codet5p_220m.
training_args = TrainingArguments(
    # Paths are passed as str, the documented type, so the arguments stay
    # JSON-serializable when logging callbacks dump them.
    output_dir=str(root_dir / "modeling" / "models" / "codet5p_220m"),
    overwrite_output_dir=False,

    do_train=True,
    save_strategy='epoch',

    num_train_epochs=2,
    # 8 examples per device x 4 accumulation steps per optimizer update.
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,

    learning_rate=5e-5,
    weight_decay=0.05,
    warmup_steps=200,

    logging_dir=str(root_dir / "modeling" / "models" / "codet5p_220m"),
    logging_first_step=True,
    logging_steps=100,
    save_total_limit=1,

    dataloader_drop_last=True,
    dataloader_num_workers=4,

    # Distributed training, DeepSpeed and fp16 are all explicitly turned off.
    local_rank=-1,
    deepspeed=None,
    fp16=False,
)

# Bundle the model, the training configuration and the tokenized training set.
trainer = Trainer(model=model, args=training_args, train_dataset=train_data)

In [None]:
# Run fine-tuning with the trainer configured in the previous cell.
trainer.train()

In [None]:
# Save the final model. With no argument, save_model() writes to the trainer's
# configured output_dir (<root>/modeling/models/codet5p_220m), so the path is
# not repeated here.
trainer.save_model()