In [None]:
import os

from pathlib import Path
from datasets import load_dataset, Dataset

from data_processing.preprocessors.preprocessor import PreprocessingPipeline
from data_processing.preprocessors.preprocessor import RemoveComments
from data_processing.preprocessors.preprocessor import SyntaxValidator
from data_processing.preprocessors.preprocessor import DuplicateFilter
from data_processing.preprocessors.preprocessor import Pep8Formatter

from data_processing.pretokenizers.firstpretokenizer import FirstPretokenizer

from data_processing.utils.pretokenize_all import pretokenize_all
from data_processing.utils.docstring_and_code_filtering import doctring_and_code_filtering
from data_processing.utils.data_loader import load_and_split_dataset
from data_processing.utils.data_preparation import preprocess

from training.utils.model_utils import load_tokenizer

from config import (
    RUN_SEGEMENTATOR,
    MODEL_NAME,
    MAX_INPUT_LENGTH,
    MAX_OUTPUT_LENGTH
)

In [None]:
# Hugging Face access token, read from the environment so that it never has to
# be written into the notebook. The value is None when the variable is unset.
access_token = os.environ.get("HUGGINGFACE_TOKEN")

# Intro

This is a simple yet powerful step-by-step guide for reproducing our results. The repository contains code for experimenting with a custom training method for the T5 language model, aimed at improving its performance in code generation tasks.

# Dataset preparation

Download and truncate the raw dataset to the desired size.

In [10]:
# Load the training split of the corpus (authenticating with `access_token`)
# and keep only its first 10 rows so the walkthrough stays fast.
dataset = (
    load_dataset("Nan-Do/code-search-net-python", token=access_token)["train"]
    .select(range(10))
)
print("Number of samples before cleaning:", len(dataset))

Number of samples before cleaning: 10


Clean the samples by removing comments, formatting them to the PEP8 standard, and getting rid of syntactically incorrect samples and duplicates.

In [11]:
# Cleaning steps, listed in the order they are handed to the pipeline.
# NOTE(review): duplicates are filtered *before* PEP8 formatting, so samples
# that differ only in formatting may not be recognised as duplicates -
# confirm that this ordering is intended.
cleaning_steps = [
    RemoveComments(),
    DuplicateFilter(),
    Pep8Formatter(),
    SyntaxValidator(),
]
pipeline = PreprocessingPipeline(cleaning_steps)

# Run every step over the truncated raw dataset.
data = pipeline.apply(dataset)

print("Number of samples after cleaning:", len(data))

Number of samples after cleaning: 10


Load and apply the pretokenizer.

The pretokenizer is simply an AST visitor that walks through the code structure and rewrites its string representation to match the special tokens in the tokenizer, so that they can be detected and translated into the proper token IDs.

In [None]:
# Build the pretokenizer with both optional flags enabled.
# NOTE(review): presumably these toggle dedent markers and semantic tokens -
# confirm against FirstPretokenizer.
pretokenizer = FirstPretokenizer(
    _use_dedent=True,
    _use_semantics=True,
)

# Apply the pretokenizer to every cleaned sample.
data = pretokenize_all(data, pretokenizer)

Load and apply the segmentator.

It allows for "masking" and helps the model capture local dependencies. It replaces each sample with several new ones in which consistent code fragments are masked out, with the labels adjusted accordingly.

In [None]:
if RUN_SEGEMENTATOR:
    # The segmentation step is not wired into this notebook yet. The previous
    # placeholder assigned the literal `...` (Ellipsis) to `data`, silently
    # discarding the dataset so that later cells would receive Ellipsis
    # instead of samples. Fail here instead, with a message naming the cause.
    raise NotImplementedError(
        "RUN_SEGEMENTATOR is enabled, but the segmentation step is not "
        "implemented in this notebook. Disable the flag in config or plug "
        "the segmentator in here."
    )

Let's filter out unnecessary columns.

In [14]:
# Reduce the dataset through the project's filtering helper.
# NOTE(review): the recorded output ("Samples with ≤512 tokens: ...") suggests
# the helper also reports, and possibly enforces, a 512-token limit - confirm
# whether longer samples are dropped here.
data = doctring_and_code_filtering(data)

Samples with ≤512 tokens: 10 / 10 (100.00%)


Split the data into subsets.

In [17]:
# Split the prepared samples into subsets; the recorded output shows the keys
# 'train', 'validation' and 'test'.
# NOTE(review): no seed is passed here - confirm that the split is
# reproducible across runs.
dataset_dict = load_and_split_dataset(data)
# Bare expression so the notebook displays the resulting splits.
dataset_dict

{'train': Dataset({
     features: ['docstring', 'parsed'],
     num_rows: 8
 }),
 'validation': Dataset({
     features: ['docstring', 'parsed'],
     num_rows: 1
 }),
 'test': Dataset({
     features: ['docstring', 'parsed'],
     num_rows: 1
 })}

The last step to complete data preparation is to adjust the sample length to the model's context window. We need to **load a tokenizer**.

In [None]:
# Load the tokenizer for the configured model; the pretokenizer is passed
# along to the loader as well.
tokenizer, specifics = load_tokenizer(MODEL_NAME, pretokenizer)

# `specifics` carries four tokenizer-specific values, unpacked by position.
(
    semantic_start_id,
    semantic_end_id,
    code_token_ids,
    semantic_token_ids,
) = specifics

Perform tokenization and length adjustments.

In [None]:
# Tokenize each split in batches. The original columns are removed, so only
# what `preprocess` returns is kept. The comprehension variables are named so
# that they do not shadow the notebook-level `dataset`.
tokenized_dataset = {
    split_name: split_data.map(
        lambda examples: preprocess(
            examples, tokenizer, MAX_INPUT_LENGTH, MAX_OUTPUT_LENGTH
        ),
        batched=True,
        remove_columns=split_data.column_names,
    )
    for split_name, split_data in dataset_dict.items()
}