In [1]:
from transformers import AutoModel, AutoTokenizer
from datasets import load_dataset
from itertools import chain
import torch
import json
import gc
import os

In [2]:
checkpoint = "Salesforce/codet5p-220m-bimodal"
device = "cuda" if torch.cuda.is_available() else "cpu"
path_dataset = r'../data/intellij-train.csv'
path_to_save = r'../datasets/'
filename = 'intellij-train-dataset'

In [3]:
gc.collect()

62

In [4]:
model = AutoModel.from_pretrained(checkpoint, trust_remote_code=True).to(device)
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)

In [5]:
dataset = load_dataset('csv', data_files=path_dataset)['train']

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [6]:
dataset = dataset.rename_columns({
    'code': 'encoder_input_text',
    'name': 'target_text'
})

In [7]:
# remove the method name from code
def remove(example):
    example['encoder_input_text'] = \
        example['encoder_input_text'].replace(example['target_text'], tokenizer.sep_token, 1)
    return example

In [8]:
dataset = dataset.map(remove)

Map:   0%|          | 0/102629 [00:00<?, ? examples/s]

In [9]:
# add the special token [TDEC] for code-to-text generation
def add_tdec(example):
    example['decoder_input_text'] = '[TDEC]'
    return example

In [10]:
dataset = dataset.map(add_tdec)

Map:   0%|          | 0/102629 [00:00<?, ? examples/s]

In [11]:
def add_prefix(example):
    example['target_text'] = 'The name of the method is: ' + example['target_text']
    return example

In [12]:
dataset = dataset.map(add_prefix)

Map:   0%|          | 0/102629 [00:00<?, ? examples/s]

In [13]:
def concat_texts(example):
    example['stacked_text'] = [text for text in example.values()]
    return example

In [14]:
dataset = \
    dataset.map(concat_texts, remove_columns=['encoder_input_text', 'decoder_input_text', 'target_text'])

Map:   0%|          | 0/102629 [00:00<?, ? examples/s]

In [15]:
def tokenize_batch_stacked_text(batch):
    stacked_texts = list(chain.from_iterable(batch['stacked_text']))

    tokenized_stacked_texts = \
        tokenizer(stacked_texts, truncation=True, padding='longest', max_length=256, return_tensors='pt')

    tokenized_stacked_texts = \
        {k: v.reshape(-1, 3, v.shape[1]) for k, v in tokenized_stacked_texts.items()}

    model_inputs = {
        "input_ids": tokenized_stacked_texts["input_ids"][:, 0, :].tolist(),
        "attention_mask": tokenized_stacked_texts["attention_mask"][:, 0, :].tolist(),
        "decoder_input_ids": tokenized_stacked_texts["input_ids"][:, 2, :].tolist(),
        "decoder_attention_mask": tokenized_stacked_texts["attention_mask"][:, 2, :].tolist(),
        "labels": tokenized_stacked_texts["input_ids"][:, 1, :]
    }
    model_inputs['labels'][model_inputs['labels'] == 0] = -100
    model_inputs['labels'] = model_inputs['labels'].tolist()
    return model_inputs

In [16]:
batch_size = 8

In [17]:
dataset = dataset \
    .shuffle(seed=42) \
    .map(tokenize_batch_stacked_text, batched=True, batch_size=batch_size, drop_last_batch=True, remove_columns=['stacked_text']) \
    .select(range(len(dataset) // batch_size))

Map:   0%|          | 0/102624 [00:00<?, ? examples/s]

In [18]:
dataset.to_json(os.path.join(path_to_save, filename + '.jsonl'))
# create json file where we put batch_size and max_length
with open(os.path.join(path_to_save, filename + '.json'), 'w') as f:
    json.dump({'batch_size': batch_size, 'max_length': 256}, f)

Creating json from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]