In [1]:
import json
import os
from pathlib import Path

import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset

from data_processing.preprocessors.preprocessor import PreprocessingPipeline
from data_processing.preprocessors.preprocessor import RemoveComments
from data_processing.preprocessors.preprocessor import SyntaxValidator
from data_processing.preprocessors.preprocessor import DuplicateFilter
from data_processing.preprocessors.preprocessor import Pep8Formatter

from data_processing.pretokenizers.firstpretokenizer import FirstPretokenizer

from data_processing.segmentators.ultimatesegmentator import UltimateSegmentator

from data_processing.utils.pretokenize_all import pretokenize_all
from data_processing.utils.docstring_and_code_filtering import doctring_and_code_filtering
from data_processing.utils.data_loader import load_and_split_dataset
from data_processing.utils.data_preparation import preprocess

from model_operations.utils.model_utils import load_tokenizer

from config import (
    RUN_SEGEMENTATOR,
    MODEL_NAME,
    MAX_INPUT_LENGTH,
    MAX_OUTPUT_LENGTH,
    USE_CUSTOM_EOS,
    EOS,
)

  from .autonotebook import tqdm as notebook_tqdm


/home/patryk/Documents/syntax-aware-language-model-for-code-generation/model_operations/training/models/t5-base-split20-epochs2-lossTrue


2025-05-20 00:56:02.591318: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747695362.611654   63662 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747695362.617671   63662 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747695362.633374   63662 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747695362.633395   63662 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747695362.633397   63662 computation_placer.cc:177] computation placer alr

In [2]:
# Hugging Face access token, taken from the environment so it is never
# hard-coded in the notebook. Evaluates to None when HUGGINGFACE_TOKEN is unset.
access_token = os.environ.get("HUGGINGFACE_TOKEN")

# Intro

This is a simple yet powerful step-by-step guide for reproducing our results. The repository contains code for experimenting with a custom training method for the T5 language model, aimed at improving its performance in code generation tasks.

# Dataset preparation

Download and truncate the raw dataset to the desired size.

In [3]:
# Load the "train" split, add a `code_length` column holding len(code) for
# every row, then sort ascending on that column so the shortest snippets
# come first.
dataset = (
    load_dataset("Nan-Do/code-search-net-python", token=access_token)["train"]
    .map(lambda row: {**row, "code_length": len(row["code"])})
    .sort("code_length")
)

In [4]:
# Summary statistics (count, mean, std, quartiles, ...) of the snippet lengths.
code_lengths = dataset["code_length"]
df = pd.DataFrame(code_lengths)
length_summary = df.describe()

print("Statistics of 'code_length' column:")
print(length_summary)

Statistics of 'code_length' column:
                   0
count  455243.000000
mean     1058.562045
std      1412.836589
min        75.000000
25%       397.000000
50%       666.000000
75%      1208.000000
max    103665.000000


In [5]:
# Keep only the first fifteenth of the rows.
# NOTE: `dataset` is overwritten, so re-running this cell shrinks it again.
subset_size = len(dataset) // 15
dataset = dataset.select(range(subset_size))

print("Number of samples before cleaning:", len(dataset))

Number of samples before cleaning: 30349


Clean the samples by removing comments, formatting them to the PEP8 standard, and getting rid of syntactically incorrect samples and duplicates.

In [6]:
# Cleaning steps, handed to the pipeline in exactly this order.
# NOTE(review): DuplicateFilter is listed before Pep8Formatter, so presumably
# samples that differ only in formatting are not treated as duplicates -
# confirm that this ordering is intended.
cleaning_steps = [
    RemoveComments(),
    DuplicateFilter(),
    Pep8Formatter(),
    SyntaxValidator(),
]
pipeline = PreprocessingPipeline(cleaning_steps)

data = pipeline.apply(dataset)
print("Number of samples after cleaning:", len(data))

Number of samples after cleaning: 30074


Let's do the same thing for another, high-quality dataset and then concatenate them.

Load and apply the pretokenizer.

The pretokenizer is simply an AST visitor that walks through the code structure and changes its string representation to match special tokens in the tokenizer, so they can be detected and translated into the proper token IDs

In [None]:
# Build the pretokenizer with both the dedent and semantics options enabled,
# then run it over all cleaned samples; `data` is replaced by the result.
pretokenizer = FirstPretokenizer(
    _use_dedent=True,
    _use_semantics=True,
)
data = pretokenize_all(data, pretokenizer)

We need a tokenizer for the segmentation process and for later use, so let's **load a tokenizer**.

In [8]:
# load_tokenizer returns a (tokenizer, specifics) pair.
tokenizer, specifics = load_tokenizer(MODEL_NAME, USE_CUSTOM_EOS, pretokenizer)

# `specifics` is unpacked into four values only when it is truthy; otherwise
# the four names below are left undefined.
if specifics:
    (
        semantic_start_id,
        semantic_end_id,
        code_token_ids,
        semantic_token_ids,
    ) = specifics

Load and apply the segmentator if specified.

It allows for "masking" and helps the model capture local dependencies. It replaces each sample with a few new ones that have consistent code fragments masked out, and the labels are adjusted accordingly

In [10]:
# Segmentation is optional and controlled by the RUN_SEGEMENTATOR config flag.
# When enabled, `data` is replaced by the segmentator's output.
if RUN_SEGEMENTATOR:
    segmentator = UltimateSegmentator(pretokenizer)
    data = segmentator.apply(
        data,
        tokenizer,
    )

Add the custom EOS token if specified.

In [None]:
# Append the custom end-of-sequence marker to every sample's 'parsed' text.
if USE_CUSTOM_EOS:

    def _with_eos(text):
        """Return `text` terminated by EOS, without appending it a second time.

        The guard keeps the cell idempotent: re-running it does not stack
        several EOS markers at the end of a sample.
        """
        return text if text.endswith(EOS) else text + EOS

    if isinstance(data, Dataset):
        # Iterating a datasets.Dataset yields a fresh dict per row, so in-place
        # edits made inside a `for` loop are silently discarded. map() writes
        # the updated 'parsed' column back into a new Dataset instead.
        data = data.map(lambda sample: {"parsed": _with_eos(sample["parsed"])})
    else:
        # Plain mutable records (e.g. a list of dicts) can be updated in place.
        for sample in data:
            sample["parsed"] = _with_eos(sample["parsed"])

Let's filter out unnecessary columns.

In [None]:
# Replace `data` with the output of the project's filtering helper.
# NOTE(review): presumably this keeps only the 'docstring' and 'parsed'
# columns (those are the features shown after the split below) - confirm
# against the helper's implementation.
data = doctring_and_code_filtering(data)

Samples with ≤512 tokens: 224133 / 224133 (100.00%)


We can save the results to use them in later external experiments.

In [None]:
# Persist the prepared samples as JSON Lines: one JSON object per line.
# `json` is imported in the notebook's import cell; the file is opened with an
# explicit encoding so the output does not depend on the platform default.
output_path = Path("50_smallest_docstring_and_code.jsonl")
with output_path.open("w", encoding="utf-8") as f:
    for sample in data:
        f.write(json.dumps(sample) + "\n")

Split the data into subsets.

In [None]:
# Split the prepared samples into subsets using the project helper.
# The result is iterated with .items() further down, i.e. it maps split
# names to datasets.
dataset_dict = load_and_split_dataset(data)

{'train': Dataset({
     features: ['docstring', 'parsed'],
     num_rows: 748
 }),
 'validation': Dataset({
     features: ['docstring', 'parsed'],
     num_rows: 94
 }),
 'test': Dataset({
     features: ['docstring', 'parsed'],
     num_rows: 94
 })}

The last step to complete data preparation is to adjust the sample length to the model's context window.

Perform tokenization and length adjustments.

In [None]:
def _tokenize_batch(batch):
    """Run the project's `preprocess` on one batch with the notebook settings."""
    return preprocess(batch, tokenizer, USE_CUSTOM_EOS, MAX_INPUT_LENGTH, MAX_OUTPUT_LENGTH)


# Tokenize every split in batched mode and drop that split's original columns,
# so only what `preprocess` returns remains.
tokenized_dataset = {
    split_name: split_ds.map(
        _tokenize_batch,
        batched=True,
        remove_columns=split_ds.column_names,
    )
    for split_name, split_ds in dataset_dict.items()
}

Map:   0%|          | 0/748 [00:00<?, ? examples/s]

Map:   0%|          | 0/94 [00:00<?, ? examples/s]

Map:   0%|          | 0/94 [00:00<?, ? examples/s]