In [None]:
import json
import os
import re

import torch
from datasets import load_dataset
from transformers import AutoTokenizer

In [None]:
# Hub identifier of the checkpoint whose tokenizer is loaded further down.
checkpoint = 'Salesforce/codet5p-220m'

# Use the GPU whenever torch reports that CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Placeholders to fill in before running: the input CSV file and the
# directory that receives the generated files.
path_dataset = ''
path_to_save = ''

# Base name (without extension) shared by the '.jsonl' and '.json' outputs.
filename = 'intellij-test-dataset'

In [None]:
# Read the CSV located at `path_dataset` and keep the rows stored under the
# 'train' key of the object returned by load_dataset.
csv_splits = load_dataset('csv', data_files=path_dataset)
dataset = csv_splits['train']

In [None]:
def remove(example):
    """Mask the method name inside the example's code for mask filling.

    Every occurrence of ``example['name']`` that stands alone as an
    identifier is replaced with the ``<extra_id_0>`` token. Occurrences that
    are only part of a longer identifier (``get`` inside ``getValue``) are
    left untouched, so unrelated identifiers are not corrupted.

    Args:
        example: mapping with string entries ``'code'`` and ``'name'``.

    Returns:
        The same ``example`` object, with ``'code'`` updated in place.
    """
    name = example['name']
    if name == '':
        # Nothing to mask; an empty pattern would match at every position
        # and scatter the sentinel token through the whole snippet.
        return example

    # Lookarounds rather than \b, so that names starting or ending with a
    # non-word character are still matched as a whole.
    whole_name = r'(?<!\w)' + re.escape(name) + r'(?!\w)'
    example['code'] = re.sub(whole_name, '<extra_id_0>', example['code'])
    return example

In [None]:
# Run `remove` on every row so the method name is masked in the 'code' column.
dataset = dataset.map(function=remove)

In [None]:
def modify_target(example):
    """Attach the mask-filling target string to the example.

    Stores ``'<extra_id_0> NAME <extra_id_1>'`` under ``example['labels']``,
    where NAME is the value of ``example['name']``.

    Returns:
        The same ``example`` object, updated in place.
    """
    target_pieces = ('<extra_id_0>', example['name'], '<extra_id_1>')
    example['labels'] = ' '.join(target_pieces)
    return example

In [None]:
# Add the 'labels' target string to every row, then drop the 'name' column.
dataset = dataset.map(
    function=modify_target,
    remove_columns=['name'],
)

In [None]:
# batch_size: rows tokenized (and padded) together in one call.
# max_length: truncation limit handed to the tokenizer.
# Both values are also written to the metadata JSON at the end of the notebook.
batch_size, max_length = 8, 256

In [None]:
# Load the tokenizer that belongs to `checkpoint`.
# NOTE(review): trust_remote_code=True permits running code shipped with the
# checkpoint repository; confirm this checkpoint actually needs it.
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
    trust_remote_code=True,
)

- To improve computational efficiency, the dataset is tokenized in batches of `batch_size` examples. Each list of token ids is padded to the length of the longest sequence in its batch (and truncated to at most `max_length` tokens).
- Do not shuffle the dataset after this preprocessing step. Each example is padded to the length of its own batch, so consecutive groups of `batch_size` examples share the same length only while the stored order is kept. For the same reason, the number of examples kept must be a multiple of the batch size (the incomplete final batch is dropped). Together, these ensure that a data loader using the same batch size, without shuffling, yields batches whose sequences all have the same length.

In [None]:
def batch_tokenize_function(batch):
    """Tokenize one batch of examples and attach loss-ready labels.

    Both the ``'code'`` and the ``'labels'`` texts of ``batch`` are tokenized
    with padding to the longest sequence in the batch and truncation to
    ``max_length``. Padding positions in the label ids are set to -100.

    Args:
        batch: mapping with the entries ``'code'`` and ``'labels'``.

    Returns:
        The tokenizer output for ``'code'``, with the masked label ids stored
        under ``'labels'``.
    """
    # The same tokenizer settings apply to the inputs and to the targets.
    tokenizer_options = dict(
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )

    encoded_inputs = tokenizer(batch['code'], **tokenizer_options)
    target_ids = tokenizer(batch['labels'], **tokenizer_options).input_ids

    # Swap every padding id in the targets for -100.
    is_padding = target_ids == tokenizer.pad_token_id
    encoded_inputs['labels'] = target_ids.masked_fill(is_padding, -100)

    return encoded_inputs

In [None]:
# Fixed seed: without it every run produces a different row order and drops a
# different set of leftover rows, so the saved dataset is not reproducible.
shuffle_seed = 42

# Largest multiple of batch_size that fits in the dataset. Measured before the
# chain below rebinds `dataset`; the steps before `select` keep one row per example.
n_full_batch_rows = len(dataset) - len(dataset) % batch_size

# Shuffle first, tokenize in batches of `batch_size` (each batch is padded to
# its own longest sequence), then keep only the rows that form complete batches.
dataset = (
    dataset
    .shuffle(seed=shuffle_seed)
    .map(batch_tokenize_function, batched=True, batch_size=batch_size, remove_columns=['code'])
    .with_format('torch')
    .select(range(n_full_batch_rows))
)

In [None]:
# Write the tokenized rows to '<filename>.jsonl' inside `path_to_save`.
jsonl_path = os.path.join(path_to_save, filename + '.jsonl')
dataset.to_json(jsonl_path)

In [None]:
# Store the preprocessing settings next to the dataset, in '<filename>.json',
# so the batch size and length limit used here can be looked up later.
preprocessing_settings = {'batch_size': batch_size, 'max_length': max_length}
settings_path = os.path.join(path_to_save, filename + '.json')
with open(settings_path, 'w') as settings_file:
    json.dump(preprocessing_settings, settings_file)