In [2]:
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import torch
import regex as re
import pandas as pd
import pathlib
import gc
import os

# Hugging Face Hub id of the checkpoint loaded below.
checkpoint = "Salesforce/codet5p-220m-bimodal"

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# repository to be executed locally -- confirm the source is trusted.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
model = AutoModel.from_pretrained(checkpoint, trust_remote_code=True)
model = model.to(device)

In [None]:
def split_camel_case(method_name):
    """Return ``method_name`` with a space inserted at each camelCase boundary.

    A boundary is either a lowercase letter followed by an uppercase letter
    ("getName" -> "get Name") or the end of an uppercase run that is followed
    by a capitalised word ("HTTPResponse" -> "HTTP Response").  Names without
    such boundaries are returned unchanged.

    Args:
        method_name: identifier to split.

    Returns:
        The pieces of ``method_name`` joined by single spaces.
    """
    # Both alternatives are zero-width, so no character is consumed by the split.
    boundary = re.compile(r'(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])')
    return ' '.join(boundary.split(method_name))

In [3]:
class IntellijCodeDataset(Dataset):
    """Dataset of (method with its name masked, method name) training pairs.

    Rows are read from a CSV file that must provide a ``method`` column (the
    method's source text) and a ``method_name`` column (the identifier to
    predict).
    """

    def __init__(self, path_to_data, tokenizer, max_length=512, max_samples=10):
        """Load the CSV and keep the leading rows.

        Args:
            path_to_data: path handed to ``pandas.read_csv``.
            tokenizer: callable tokenizer exposing ``sep_token`` and
                ``pad_token_id``; used for every encoding done by this dataset.
            max_length: length every encoded sequence is padded/truncated to.
            max_samples: number of leading rows to keep.  Defaults to 10, the
                limit that used to be hard-coded; pass ``None`` to keep all rows.
        """
        self.path_to_data = path_to_data
        self.tokenizer = tokenizer
        self.max_length = max_length

        df = pd.read_csv(path_to_data)
        self.methods = df["method"].iloc[:max_samples].tolist()
        self.method_names = df["method_name"].iloc[:max_samples].tolist()

    def __len__(self):
        """Return the number of samples kept by ``__init__``."""
        return len(self.method_names)

    def __getitem__(self, index):
        """Encode one sample.

        Args:
            index: position of the sample.

        Returns:
            The tokenizer output for the masked method, extended with
            ``labels``, ``decoder_input_ids`` and ``decoder_attention_mask``.
            Tensors are returned exactly as the tokenizer produced them
            (presumably with a leading axis of size 1 because of
            ``return_tensors="pt"`` -- TODO confirm).
        """
        method = self.methods[index]
        method_name = self.method_names[index]

        # encode method: only the first occurrence of the name is replaced by
        # the separator token, so the target is hidden from the encoder input
        masked_method = method.replace(method_name, self.tokenizer.sep_token, 1)
        model_inputs = self.tokenizer(
            masked_method,
            return_tensors="pt",
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
        )

        # encode method name and set as target; use the tokenizer given to this
        # dataset rather than whatever global happens to be named `tokenizer`
        labels = self.tokenizer(
            split_camel_case(method_name),
            return_tensors="pt",
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
        )["input_ids"]
        # -100 is the default ignore_index of torch's cross-entropy loss;
        # presumably the model's loss skips these padded positions.
        labels[labels == self.tokenizer.pad_token_id] = -100
        model_inputs["labels"] = labels

        # decoder inputs
        # NOTE(review): the decoder input is the fixed "[TDEC]" prompt, not the
        # shifted labels -- confirm this is what the model expects when
        # `labels` are supplied as well.
        decoder_inputs = self.tokenizer(
            ["[TDEC]"],
            return_tensors="pt",
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
        )
        model_inputs["decoder_input_ids"] = decoder_inputs["input_ids"]
        model_inputs["decoder_attention_mask"] = decoder_inputs["attention_mask"]

        return model_inputs

In [None]:
# --- data / output locations (relative to the notebook's working directory) ---
path_to_data = "../data/train.csv"
path_to_save = "../experiments/"

# Prefix of each saved checkpoint directory; the epoch index is appended to it.
model_save_name = "codet5p-220m-bimodal"

# --- training schedule ---
epochs = 1
batch_size = 1

In [4]:
# Drop cached CUDA blocks and run a garbage-collection pass before training.
torch.cuda.empty_cache()
gc.collect()

# Make sure the checkpoint directory exists.
save_dir = pathlib.Path(path_to_save)
save_dir.mkdir(parents=True, exist_ok=True)

dataset = IntellijCodeDataset(path_to_data, tokenizer)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
model.train()

# Batch entries forwarded to the model, in the order they are moved to `device`.
batch_fields = (
    "input_ids",
    "attention_mask",
    "labels",
    "decoder_input_ids",
    "decoder_attention_mask",
)

for epoch in range(epochs):
    progress = tqdm(dataloader)
    for step, batch in enumerate(progress):
        optimizer.zero_grad()

        # Squeeze axis 1 of every field and move it to the target device
        # (presumably each field is (batch, 1, seq) after collation -- TODO confirm).
        model_kwargs = {
            field: batch[field].squeeze(1).to(device) for field in batch_fields
        }

        # forward pass (training step, gradients enabled)
        outputs = model(**model_kwargs)

        # compute loss and backprop
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # update progress bar
        progress.set_description(f"Epoch: {epoch} Batch: {step} Loss: {loss.item():.4f}")

    # save model at end of epoch
    model.save_pretrained(os.path.join(path_to_save, f"{model_save_name}-{epoch}"))


  0%|          | 0/10 [00:16<?, ?it/s]


KeyboardInterrupt: 