In [None]:
# Install required dependencies
!pip install datasets
!pip install evaluate
!pip install transformers -U
!pip install accelerate -U
!pip install rouge_score
!pip install fastparquet

In [2]:
import re
import pandas as pd


def format_text_for_code_gen(row):
    text = ''

    # Add annotations to text
    if not pd.isna(row.access_modifiers_annotation):
        for annotation in row.access_modifiers_annotation.split(', '):
            text += f"{annotation}\n"

    # Add test-annotations (if there are any) to text
    if not pd.isna(row.access_modifiers_test):
        for annotation in row.access_modifiers_test.split(', '):
            text += f"{annotation}\n"

    # Create signature for function (access modifiers, type identifier, formal parameters) and add it to text
    parameters = '' if pd.isna(row.formal_parameters) else row.formal_parameters
    signature = f"{row.access_modifiers.replace(', ', ' ')} {row.type_identifier} <extra_id_0>( {parameters} ) "
    text += signature

    # Add code block without comments to text
    if not pd.isna(row.block):
        # Remove comments
        code_block = re.sub(r'/\*.*?\*/', '', str(row.block), flags=re.DOTALL)
        code_block = re.sub(r'//.*?\n', '', code_block)
        text += code_block
    else:
        text += ';'

    return text

In [4]:
from tqdm import tqdm
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict

checkpoint = "Salesforce/codet5p-220m"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

max_input_length = 512
max_target_length = 20

# You need to upload this file in the /content folder in Google Colab
processed_io_df = pd.read_parquet('/content/functions_df_inputs_outputs.parquet.gz', engine='fastparquet').sample(frac=1, random_state=42)
print(processed_io_df.shape)

(402873, 6)


In [5]:
multiline_functions_df = processed_io_df[(processed_io_df.is_multiline == True) & (processed_io_df.is_test == False)]\
    .reset_index(drop=True)

abstract_functions_df = processed_io_df[(processed_io_df.is_abstract == True) & (processed_io_df.is_test == False)].reset_index(drop=True)

tests_df = processed_io_df[processed_io_df.is_test == True].reset_index(drop=True)

one_liners_df = processed_io_df[~processed_io_df['function_id'].isin(multiline_functions_df['function_id']) &
                                ~processed_io_df['function_id'].isin(abstract_functions_df['function_id']) &
                                ~processed_io_df['function_id'].isin(tests_df['function_id'])].reset_index(drop=True)

print(multiline_functions_df.shape)
print(abstract_functions_df.shape)
print(tests_df.shape)
print(one_liners_df.shape)
print(multiline_functions_df.shape[0] + abstract_functions_df.shape[0] + tests_df.shape[0] + one_liners_df.shape[0])

(221798, 6)
(10276, 6)
(48102, 6)
(122698, 6)
402874


In [6]:
processed_io_df = pd.concat([multiline_functions_df[:50000],
                             abstract_functions_df[:3000],
                             tests_df[:7000],
                             one_liners_df[:40000]]).sample(frac=1, random_state=42)
print(processed_io_df.shape)

(100000, 6)


In [7]:
def preprocess_function(dataframe):
    model_inputs = {'input_ids': [], 'attention_mask': [], 'labels': []}
    for index in tqdm(range(dataframe.shape[0])):
        row = dataframe.iloc[index]
        input_ids = tokenizer(row.input, truncation=True, max_length=max_input_length, padding="max_length")
        label = tokenizer(row.label, truncation=True, max_length=max_target_length, padding="max_length")
        model_inputs['input_ids'].append(input_ids['input_ids'])
        model_inputs['attention_mask'].append(input_ids['attention_mask'])
        model_inputs['labels'].append(label['input_ids'])
    return model_inputs

# processed_io_df_wo_abstract = processed_io_df[(processed_io_df.is_abstract == False)].reset_index(drop=True)
model_inputs = preprocess_function(processed_io_df)
dataset = Dataset.from_dict(model_inputs)

dataset_train_test = dataset.train_test_split(test_size=0.2, seed=42)
dataset_splits = DatasetDict({
    'train': dataset_train_test['train'],
    'test': dataset_train_test['test']
    })

100%|██████████| 100000/100000 [01:29<00:00, 1123.21it/s]


In [8]:
import evaluate
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict, load_metric, load_from_disk
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainer
from transformers import Seq2SeqTrainingArguments
import nltk
import torch

checkpoint = "Salesforce/codet5p-220m"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

train = dataset_splits['train']
test = dataset_splits['test']

batch_size = 8
model_name = "codet5p-220m-function-name-generation"
model_dir = f"./{model_name}"

args = Seq2SeqTrainingArguments(
    model_dir,
    evaluation_strategy="steps",
    eval_steps=10000,
    logging_strategy="steps",
    logging_steps=10000,
    save_strategy="steps",
    save_steps=10000,
    learning_rate=4e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
)

In [9]:
import nltk
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

data_collator = DataCollatorForSeq2Seq(tokenizer)

# Setup evaluation
nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # decode preds and labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return result


# Function that returns an untrained model to be trained
def model_init():
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    model = model.to('cuda')
    return model
    # return AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [12]:
import os
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Create a folder in the root directory
!mkdir -p "/content/drive/My Drive/Fine tuning"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
# %tensorboard --logdir logs

trainer = Seq2SeqTrainer(
    model_init=model_init,
    args=args,
    train_dataset=train,
    eval_dataset=test,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics)

# Start TensorBoard before training to monitor it in progress
trainer.train()

trainer.save_model("/content/drive/My Drive/Fine tuning/model")

config.json:   0%|          | 0.00/768 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/446M [00:00<?, ?B/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
10000,0.4816,0.324875,0.445009,0.005775,0.445053,0.444882
20000,0.2641,0.303348,0.471647,0.005897,0.471859,0.471663
30000,0.1944,0.306891,0.483316,0.00599,0.483377,0.483229


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
